diff --git a/stb_image.h b/stb_image.h
index a632d5435..1061973f3 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -787,6 +787,16 @@ static int stbi__sse2_available(void)
 #endif
 #endif
 
+// LOONGARCH LSX
+#if defined(STBI_NO_SIMD) && defined(STBI_LSX)
+#undef STBI_LSX
+#endif
+
+#ifdef STBI_LSX
+#include <lsxintrin.h>
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+
 #ifndef STBI_SIMD_ALIGN
 #define STBI_SIMD_ALIGN(type, name) type name
 #endif
@@ -2911,6 +2921,228 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
 
 #endif // STBI_NEON
 
+#ifdef STBI_LSX
+// LSX integer IDCT, a port of the SSE2 version above. not the fastest possible
+// implementation but it produces bit-identical results to the generic C version
+// so it's fully "transparent".
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   // This is constructed to match our regular (generic) integer IDCT exactly.
+   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+   __m128i tmp, tmp1, tmp2;
+
+   tmp  = __lsx_vreplgr2vr_w(0);
+   tmp1 = __lsx_vreplgr2vr_w(0);
+   tmp2 = __lsx_vreplgr2vr_w(0);
+
+// dot product constant: even elems=x, odd elems=y
+#define dct_const(x, y) __lsx_vpackev_h(__lsx_vreplgr2vr_h((y)), __lsx_vreplgr2vr_h((x)))
+
+// out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+// out(1) = c1[even]*x + c1[odd]*y
+#define dct_rot(out0, out1, x, y, c0, c1, tmp1, tmp2) \
+   __m128i c0##lo = __lsx_vilvl_h((y), (x)); \
+   __m128i c0##hi = __lsx_vilvh_h((y), (x)); \
+   tmp1 = __lsx_vreplgr2vr_w(0); \
+   tmp2 = __lsx_vreplgr2vr_w(0); \
+   tmp1 = __lsx_vmaddwev_w_h(tmp1, c0##lo, c0); \
+   tmp2 = __lsx_vmaddwod_w_h(tmp2, c0##lo, c0); \
+   __m128i out0##_l = __lsx_vadd_w(tmp1, tmp2); \
+   tmp1 = __lsx_vreplgr2vr_w(0); \
+   tmp2 = __lsx_vreplgr2vr_w(0); \
+   tmp1 = __lsx_vmaddwev_w_h(tmp1, c0##hi, c0); \
+   tmp2 = __lsx_vmaddwod_w_h(tmp2, c0##hi, c0); \
+   __m128i out0##_h = __lsx_vadd_w(tmp1, tmp2); \
+   tmp1 = __lsx_vreplgr2vr_w(0); \
+   tmp2 = __lsx_vreplgr2vr_w(0); \
+   tmp1 = __lsx_vmaddwev_w_h(tmp1, c0##lo, c1); \
+   tmp2 = __lsx_vmaddwod_w_h(tmp2, c0##lo, c1); \
+   __m128i out1##_l = __lsx_vadd_w(tmp1, tmp2); \
+   tmp1 = __lsx_vreplgr2vr_w(0); \
+   tmp2 = __lsx_vreplgr2vr_w(0); \
+   tmp1 = __lsx_vmaddwev_w_h(tmp1, c0##hi, c1); \
+   tmp2 = __lsx_vmaddwod_w_h(tmp2, c0##hi, c1); \
+   __m128i out1##_h = __lsx_vadd_w(tmp1, tmp2);
+
+// out = in << 12  (in 16-bit, out 32-bit)
+#define dct_widen(out, in) \
+   __m128i out##_l = __lsx_vsrai_w(__lsx_vilvl_h((in), __lsx_vreplgr2vr_d(0)), 4); \
+   __m128i out##_h = __lsx_vsrai_w(__lsx_vilvh_h((in), __lsx_vreplgr2vr_d(0)), 4)
+
+// wide add
+#define dct_wadd(out, a, b) \
+   __m128i out##_l = __lsx_vadd_w(a##_l, b##_l); \
+   __m128i out##_h = __lsx_vadd_w(a##_h, b##_h)
+
+// wide sub
+#define dct_wsub(out, a, b) \
+   __m128i out##_l = __lsx_vsub_w(a##_l, b##_l); \
+   __m128i out##_h = __lsx_vsub_w(a##_h, b##_h)
+
+// butterfly a/b, add bias, then shift by "s" and pack
+#define dct_bfly32o(out0, out1, a, b, bias, s, tmp1, tmp2) \
+   { \
+      __m128i abiased_l = __lsx_vadd_w(a##_l, bias); \
+      __m128i abiased_h = __lsx_vadd_w(a##_h, bias); \
+      dct_wadd(sum, abiased, b); \
+      dct_wsub(dif, abiased, b); \
+      tmp1 = __lsx_vsat_w(__lsx_vsrai_w(sum_l, s), 15); \
+      tmp2 = __lsx_vsat_w(__lsx_vsrai_w(sum_h, s), 15); \
+      out0 = __lsx_vpickev_h(tmp2, tmp1); \
+      tmp1 = __lsx_vsat_w(__lsx_vsrai_w(dif_l, s), 15); \
+      tmp2 = __lsx_vsat_w(__lsx_vsrai_w(dif_h, s), 15); \
+      out1 = __lsx_vpickev_h(tmp2, tmp1); \
+   }
+
+// 8-bit interleave step (for transposes)
+#define dct_interleave8(a, b) \
+   tmp 
= a; \ + a = __lsx_vilvl_b(b, a); \ + b = __lsx_vilvh_b(b, tmp) + +// 16-bit interleave step (for transposes) +#define dct_interleave16(a, b) \ + tmp = a; \ + a = __lsx_vilvl_h(b, a); \ + b = __lsx_vilvh_h(b, tmp) + +#define dct_pass(bias, shift) \ + { \ + /* even part */ \ + dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1, tmp1, tmp2); \ + __m128i sum04 = __lsx_vadd_h(row0, row4); \ + __m128i dif04 = __lsx_vsub_h(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1, tmp1, tmp2); \ + dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1, tmp1, tmp2); \ + __m128i sum17 = __lsx_vadd_h(row1, row7); \ + __m128i sum35 = __lsx_vadd_h(row3, row5); \ + dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1, tmp1, tmp2); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0, row7, x0, x7, bias, shift, tmp1, tmp2); \ + dct_bfly32o(row1, row6, x1, x6, bias, shift, tmp1, tmp2); \ + dct_bfly32o(row2, row5, x2, x5, bias, shift, tmp1, tmp2); \ + dct_bfly32o(row3, row4, x3, x4, bias, shift, tmp1, tmp2); \ + } + + __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f), stbi__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f), stbi__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f)); + __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f), stbi__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f)); + + // rounding biases in column/row passes, see stbi__idct_block for explanation. 
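+   // bias_0 = 1<<9 rounds the column pass (which shifts down by 10); bias_1 folds
+   // the rounding term for the row pass (1<<16 for the >>17) together with the
+   // +128 level shift (128<<17), exactly as in the scalar stbi__idct_block.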
+ __m128i bias_0 = __lsx_vreplgr2vr_w(512); + __m128i bias_1 = __lsx_vreplgr2vr_w(65536 + (128<<17)); + + // load + row0 = __lsx_vld((const __m128i *)(data + 0 * 8), 0); + row1 = __lsx_vld((const __m128i *)(data + 1 * 8), 0); + row2 = __lsx_vld((const __m128i *)(data + 2 * 8), 0); + row3 = __lsx_vld((const __m128i *)(data + 3 * 8), 0); + row4 = __lsx_vld((const __m128i *)(data + 4 * 8), 0); + row5 = __lsx_vld((const __m128i *)(data + 5 * 8), 0); + row6 = __lsx_vld((const __m128i *)(data + 6 * 8), 0); + row7 = __lsx_vld((const __m128i *)(data + 7 * 8), 0); + + // column pass + dct_pass(bias_0, 10); + + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + // row pass + dct_pass(bias_1, 17); + + { + // pack + tmp1 = __lsx_vsat_hu(row0, 7); + tmp2 = __lsx_vsat_hu(row1, 7); + __m128i p0 = __lsx_vpickev_b(tmp2, tmp1); // a0a1a2a3...a7b0b1b2b3...b7 + + tmp1 = __lsx_vsat_hu(row2, 7); + tmp2 = __lsx_vsat_hu(row3, 7); + __m128i p1 = __lsx_vpickev_b(tmp2, tmp1); + + tmp1 = __lsx_vsat_hu(row4, 7); + tmp2 = __lsx_vsat_hu(row5, 7); + __m128i p2 = __lsx_vpickev_b(tmp2, tmp1); + + tmp1 = __lsx_vsat_hu(row6, 7); + tmp2 = __lsx_vsat_hu(row7, 7); + __m128i p3 = __lsx_vpickev_b(tmp2, tmp1); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... + + // store + *(unsigned long *)out = __lsx_vpickve2gr_d(p0, 0); + out += out_stride; + *(unsigned long *)out = __lsx_vpickve2gr_d(__lsx_vshuf4i_w(p0, 0x4e), 0); + out += out_stride; + *(unsigned long *)out = __lsx_vpickve2gr_d(p2, 0); + out += out_stride; + *(unsigned long *)out = __lsx_vpickve2gr_d(__lsx_vshuf4i_w(p2, 0x4e), 0); + out += out_stride; + *(unsigned long *)out = __lsx_vpickve2gr_d(p1, 0); + out += out_stride; + *(unsigned long *)out = __lsx_vpickve2gr_d(__lsx_vshuf4i_w(p1, 0x4e), 0); + out += out_stride; + *(unsigned long *)out = __lsx_vpickve2gr_d(p3, 0); + out += out_stride; + *(unsigned long *)out = __lsx_vpickve2gr_d(__lsx_vshuf4i_w(p3, 0x4e), 0); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass +} + +#endif // STBI_LSX + #define STBI__MARKER_none 0xff // if there's a pending marker from the entropy stream, return that // otherwise, fetch from the stream and get a marker. 
if there's no @@ -3525,7 +3757,7 @@ static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc return out; } -#if defined(STBI_SSE2) || defined(STBI_NEON) +#if defined(STBI_SSE2) || defined(STBI_NEON) || defined(STBI_LSX) static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) { // need to generate 2x2 samples for every one in input @@ -3618,6 +3850,58 @@ static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stb o.val[0] = vqrshrun_n_s16(even, 4); o.val[1] = vqrshrun_n_s16(odd, 4); vst2_u8(out + i*2, o); + +#elif defined(STBI_LSX) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = __lsx_vldi(0); + __m128i farb = __lsx_vldi(0); + farb = __lsx_vinsgr2vr_d(farb, __lsx_vpickve2gr_d(__lsx_vld((__m128i *)(in_far + i), 0), 0), 0); + __m128i nearb = __lsx_vldi(0); + nearb = __lsx_vinsgr2vr_d(nearb, __lsx_vpickve2gr_d(__lsx_vld((__m128i *)(in_near + i), 0), 0), 0); + __m128i farw = __lsx_vilvl_b(zero, farb); + __m128i nearw = __lsx_vilvl_b(zero, nearb); + __m128i diff = __lsx_vsub_h(farw, nearw); + __m128i nears = __lsx_vslli_h(nearw, 2); + __m128i curr = __lsx_vadd_h(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + __m128i prv0 = __lsx_vbsll_v(curr, 2); + __m128i nxt0 = __lsx_vbsrl_v(curr, 2); + __m128i prev = __lsx_vinsgr2vr_h(prv0, t1, 0); + __m128i next = __lsx_vinsgr2vr_h(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = __lsx_vldi(0); + bias = __lsx_vinsgr2vr_h(bias, 8, 0); + __m128i curs = __lsx_vslli_h(curr, 2); + __m128i prvd = __lsx_vsub_h(prev, curr); + __m128i nxtd = __lsx_vsub_h(next, curr); + __m128i curb = __lsx_vadd_h(curs, bias); + __m128i even = __lsx_vadd_h(prvd, curb); + __m128i odd = __lsx_vadd_h(nxtd, curb); + + // interleave even and odd pixels, then undo scaling. + __m128i int0 = __lsx_vilvl_h(odd, even); + __m128i int1 = __lsx_vilvh_h(odd, even); + __m128i de0 = __lsx_vsrli_h(int0, 4); + __m128i de1 = __lsx_vsrli_h(int1, 4); + + // pack and write output + __m128i tmp1, tmp2; + tmp1 = __lsx_vmax_h(zero, de0); + tmp1 = __lsx_vsat_h(tmp1, 7); + tmp2 = __lsx_vmax_h(zero, de1); + tmp2 = __lsx_vsat_h(tmp2, 7); + __m128i outv = __lsx_vpickev_b(tmp2, tmp1); + __lsx_vst(outv, (__m128i *)(out + i * 2), 0); #endif // "previous" value for next iter @@ -3681,7 +3965,7 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc } } -#if defined(STBI_SSE2) || defined(STBI_NEON) +#if defined(STBI_SSE2) || defined(STBI_NEON) || defined(STBI_LSX) static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) { int i = 0; @@ -3793,6 +4077,86 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons } #endif +#ifdef STBI_LSX + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. 
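+   // The constants below are the JFIF YCbCr->RGB coefficients in 4.12 fixed point,
+   // i.e. (short)(c * 4096 + 0.5); the SSE2 version builds them as full-lane splats
+   // with _mm_set1_epi16. Unpacking the chroma bytes into the high byte of each
+   // 16-bit lane scales them by 256, so the high-half multiply (vmuh_h keeps the
+   // top 16 bits of the 32-bit product) leaves each product at 16x scale, matching
+   // the luma term (y*256 + 128) >> 4 = y*16 + 8; the final >>4 rounds back to 8 bits.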
+ if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. + __m128i signflip = __lsx_vldi(0); + __lsx_vinsgr2vr_b(signflip, (-0x80), 0); + __m128i cr_const0 = __lsx_vldi(0); + __lsx_vinsgr2vr_h(cr_const0, (short)(1.40200f * 4096.0f + 0.5f), 0); + __m128i cr_const1 = __lsx_vldi(0); + __lsx_vinsgr2vr_h(cr_const1, (-(short)(0.71414f * 4096.0f + 0.5f)), 0); + __m128i cb_const0 = __lsx_vldi(0); + __lsx_vinsgr2vr_h(cb_const0, (-(short)(0.34414f * 4096.0f + 0.5f)), 0); + __m128i cb_const1 = __lsx_vldi(0); + __lsx_vinsgr2vr_h(cb_const1, ((short)(1.77200f * 4096.0f + 0.5f)), 0); + __m128i y_bias = __lsx_vldi(0); + __lsx_vinsgr2vr_b(y_bias, ((char)(unsigned char)128), 0); + __m128i xw = __lsx_vldi(0); + __lsx_vinsgr2vr_h(xw, (255), 0); // alpha channel + + for (; i + 7 < count; i += 8) { + // load + __m128i y_bytes = __lsx_vldi(0); + __lsx_vinsgr2vr_d(y_bytes, *(unsigned long*)(y + i), 0); + __m128i cr_bytes = __lsx_vldi(0); + __lsx_vinsgr2vr_d(cr_bytes, *(unsigned long*)(pcr + i), 0); + __m128i cb_bytes = __lsx_vldi(0); + __lsx_vinsgr2vr_d(cb_bytes, *(unsigned long*)(pcb + i), 0); + __m128i cr_biased = __lsx_vxor_v(cr_bytes, signflip); // -128 + __m128i cb_biased = __lsx_vxor_v(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = __lsx_vilvl_b(y_bytes ,y_bias); + __m128i crw = __lsx_vilvl_b(cr_biased, __lsx_vldi(0)); + __m128i cbw = __lsx_vilvl_b(cb_biased, __lsx_vldi(0)); + + // color transform + __m128i yws = __lsx_vsrli_h(yw, 4); + __m128i cr0 = __lsx_vmuh_h(cr_const0, crw); + __m128i cb0 = __lsx_vmuh_h(cb_const0, cbw); + __m128i cb1 = __lsx_vmuh_h(cbw, cb_const1); + __m128i cr1 = __lsx_vmuh_h(crw, cr_const1); + __m128i rws = __lsx_vadd_h(cr0, yws); + __m128i gwt = __lsx_vadd_h(cb0, yws); + __m128i bws = __lsx_vadd_h(yws, cb1); + __m128i gws = __lsx_vadd_h(gwt, cr1); + + // descale + __m128i rw = __lsx_vsrai_h(rws, 4); + __m128i bw = __lsx_vsrai_h(bws, 4); + __m128i gw = __lsx_vsrai_h(gws, 4); + + // back to byte, set up for transpose + __m128i tmp1, tmp2, vzero = __lsx_vldi(0); + tmp1 = __lsx_vmax_h(vzero, rw); + tmp1 = __lsx_vsat_hu(tmp1, 7); + tmp2 = __lsx_vmax_h(vzero, bw); + tmp2 = __lsx_vsat_hu(tmp2, 7); + __m128i brb = __lsx_vpickev_b(tmp2, tmp1); + tmp1 = __lsx_vmax_h(vzero, gw); + tmp1 = __lsx_vsat_hu(tmp1, 7); + tmp2 = __lsx_vmax_h(vzero, xw); + tmp2 = __lsx_vsat_hu(tmp2, 7); + __m128i gxb = __lsx_vpickev_b(tmp2, tmp1); + + // transpose to interleave channels + __m128i t0 = __lsx_vilvl_b(gxb, brb); + __m128i t1 = __lsx_vilvh_b(gxb, brb); + __m128i o0 = __lsx_vilvl_h(t1, t0); + __m128i o1 = __lsx_vilvh_h(t1, t0); + + // store + __lsx_vst(o0, (__m128i *)(out + 0), 0); + __lsx_vst(o1, (__m128i *)(out + 16), 0); + out += 32; + } + } +#endif + for (; i < count; ++i) { int y_fixed = (y[i] << 20) + (1<<19); // rounding int r,g,b; @@ -3836,6 +4200,12 @@ static void stbi__setup_jpeg(stbi__jpeg *j) j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; #endif + +#ifdef STBI_LSX + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; +#endif } // clean up the temporary component buffers diff --git a/stb_image_resize2.h b/stb_image_resize2.h index 1cd379a72..200f50813 100644 --- a/stb_image_resize2.h +++ b/stb_image_resize2.h @@ -2071,7 +2071,215 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) } #define STBIR_SIMD +#elif defined(STBIR_LSX) 
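+  // LoongArch 128-bit LSX mapping of the stbir__simdf/stbir__simdi helper layer,
+  // mirroring the SSE2/NEON/WASM blocks above; __m128/__m128i are the LSX vector
+  // types from lsxintrin.h, and the casts just reinterpret the same 128-bit register.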
+  #include <lsxintrin.h>
+  #define stbir__simdf __m128
+  #define stbir__simdi __m128i
+
+  typedef union
+  {
+    int32_t i;
+    float f;
+  } FloatInt;
+
+  static __m128 __lsx_vreplfr2vr_s(float val)
+  {
+    FloatInt fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+  }
+
+  #define stbir_simdi_castf( reg ) (__m128i)(reg)
+  #define stbir_simdf_casti( reg ) (__m128)(reg)
+
+  #define stbir__simdf_load( reg, ptr ) (reg) = ( (__m128)__lsx_vld( (float const*)(ptr), 0 ) )
+  #define stbir__simdi_load( reg, ptr ) (reg) = __lsx_vld( (stbir__simdi const*)(ptr), 0 )
+  #define stbir__simdf_load1( out, ptr ) (out) = (__m128)__lsx_vreplfr2vr_s( *((float const*)(ptr)) )
+  #define stbir__simdi_load1( out, ptr ) (out) = ( (__m128i) __lsx_vreplgr2vr_w( *((uint32_t const*)(ptr)) ) )
+  #define stbir__simdf_load1z( out, ptr ) (out) = (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), *((uint32_t *)(ptr)), 0) // top values must be zero
+  #define stbir__simdf_frep4( fvar ) (__m128)__lsx_vreplfr2vr_s( (fvar) )
+  #define stbir__simdf_load1frep4( out, fvar ) (out) = (__m128)__lsx_vreplfr2vr_s( fvar )
+  #define stbir__simdf_load2( out, ptr ) (out) = (__m128)__lsx_vreplgr2vr_d( *(uint64_t*)(ptr)) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdf_load2z( out, ptr ) (out) = (__m128)__lsx_vinsgr2vr_d(__lsx_vldi(0), *(uint64_t*)(ptr), 0) // top values must be zero
+  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = (__m128)__lsx_vinsgr2vr_d(reg, *(uint64_t*)(ptr), 1)
+
+  #define stbir__simdf_zeroP() (__m128)__lsx_vldi(0)
+  #define stbir__simdf_zero( reg ) (reg) = (__m128)__lsx_vldi(0)
+
+  #define stbir__simdf_store( ptr, reg ) __lsx_vst((__m128i)(reg), (ptr), 0)
+  #define stbir__simdf_store1( ptr, reg ) __lsx_vstelm_w((__m128i)(reg), (ptr), 0, 0)
+  #define stbir__simdf_store2( ptr, reg ) __lsx_vstelm_d((__m128i)(reg), (ptr), 0, 0)
+  #define stbir__simdf_store2h( ptr, reg ) __lsx_vstelm_d((__m128i)(reg), (ptr), 0, 1)
+
+  #define stbir__simdi_store( ptr, reg ) __lsx_vst((__m128i)(reg), (ptr), 0)
+  #define stbir__simdi_store1( ptr, reg ) __lsx_vstelm_w((__m128i)(reg), (ptr), 0, 0)
+  #define stbir__simdi_store2( ptr, reg ) __lsx_vstelm_d((__m128i)(reg), (ptr), 0, 0)
+
+  #define stbir__prefetch( ptr ) __builtin_prefetch((char*)ptr, 0, 3)
+
+  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
+  { \
+    stbir__simdi zero = __lsx_vldi(0); \
+    out2 = __lsx_vilvl_b(zero, ireg); \
+    out3 = __lsx_vilvh_b(zero, ireg); \
+    out0 = __lsx_vilvl_h(zero, out2); \
+    out1 = __lsx_vilvh_h(zero, out2); \
+    out2 = __lsx_vilvl_h(zero, out3); \
+    out3 = __lsx_vilvh_h(zero, out3); \
+  }
+
+  #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
+  { \
+    stbir__simdi zero = __lsx_vldi(0); \
+    out = __lsx_vilvl_b( zero, ireg ); \
+    out = __lsx_vilvl_h( zero, out ); \
+  }
+
+  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
+  { \
+    stbir__simdi zero = __lsx_vldi(0); \
+    out0 = __lsx_vilvl_h( zero, ireg ); \
+    out1 = __lsx_vilvh_h( zero, ireg ); \
+  }
+
+  #define stbir__simdf_convert_float_to_i32( i, f ) (i) = __lsx_vftintrz_w_s(f)
+  #define stbir__simdf_convert_float_to_int( f ) __lsx_vpickve2gr_w(__lsx_vftintrz_w_s(f), 0)
+  #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)__lsx_vpickve2gr_w(__lsx_vftintrz_w_s(__lsx_vfmax_s(__lsx_vfmin_s(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),(stbir__simdf)__lsx_vldi(0))), 0))
+  #define stbir__simdf_convert_float_to_short( f ) ((unsigned 
short)__lsx_vpickve2gr_w(__lsx_vftintrz_w_s(__lsx_vfmax_s(__lsx_vfmin_s(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),(stbir__simdf)__lsx_vldi(0))), 0)) + + #define stbir__simdi_to_int( i ) __lsx_vpickve2gr_w(i, 0) + #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = __lsx_vffint_s_w( ireg ) + #define stbir__simdf_add( out, reg0, reg1 ) (out) = __lsx_vfadd_s( reg0, reg1 ) + #define stbir__simdf_mult( out, reg0, reg1 ) (out) = __lsx_vfmul_s( reg0, reg1 ) + #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = __lsx_vfmul_s( reg, (stbir__simdf)__lsx_vld( (ptr), 0 ) ) + #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = (__m128)__lsx_vextrins_w((stbir__simdi)(reg), (__m128i)__lsx_vfmul_s( reg, (__m128)__lsx_vldrepl_w( (ptr), 0 ) ), 0) + #define stbir__simdf_add_mem( out, reg, ptr ) (out) = __lsx_vfadd_s( reg, (stbir__simdf)__lsx_vld( (ptr), 0 ) ) + #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = (__m128)__lsx_vextrins_w((stbir__simdi)(reg), (stbir__simdi)__lsx_vfadd_s( (reg), (stbir__simdf)__lsx_vldrepl_w( (ptr), 0 ) ), 0) + + #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = __lsx_vfadd_s(__lsx_vfmul_s(mul1, mul2), add) + #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = (stbir__simdf)__lsx_vextrins_w((stbir__simdi)add, (stbir__simdi)__lsx_vfadd_s(__lsx_vfmul_s(mul1, mul2), add), 0) + #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = __lsx_vfadd_s(__lsx_vfmul_s(mul, (stbir__simdf)__lsx_vld((ptr), 0)), add) + #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = (stbir__simdf)__lsx_vextrins_w((stbir__simdi)add, (stbir__simdi)__lsx_vfadd_s(__lsx_vfmul_s(mul, (stbir__simdf)__lsx_vldrepl_w((ptr), 0)), add), 0) + + #define stbir__simdf_add1( out, reg0, reg1 ) (out) = (stbir__simdf)__lsx_vextrins_w((stbir__simdi)reg0, (stbir__simdi)__lsx_vfadd_s( reg0, reg1 ), 0) + #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = (stbir__simdf)__lsx_vextrins_w((stbir__simdi)reg0, (stbir__simdi)__lsx_vfmul_s( reg0, reg1 ), 0) + + #define stbir__simdf_and( out, reg0, reg1 ) (out) = (__m128)__lsx_vand_v( (__m128i)reg0, (__m128i)reg1 ) + #define stbir__simdf_or( out, reg0, reg1 ) (out) = (__m128)__lsx_vor_v( (__m128i)reg0, (__m128i)reg1 ) + + #define stbir__simdf_min( out, reg0, reg1 ) (out) = __lsx_vfmin_s( reg0, reg1 ) + #define stbir__simdf_max( out, reg0, reg1 ) (out) = __lsx_vfmax_s( reg0, reg1 ) + #define stbir__simdf_min1( out, reg0, reg1 ) (out) = (stbir__simdf)__lsx_vextrins_w((__m128i)reg0, (__m128i)__lsx_vfmin_s( reg0, reg1 ), 0) + #define stbir__simdf_max1( out, reg0, reg1 ) (out) = (stbir__simdf)__lsx_vextrins_w((__m128i)reg0, (__m128i)__lsx_vfmax_s( reg0, reg1 ), 0) + + #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out)=(stbir__simdf)( __lsx_vshuf4i_w( __lsx_vpermi_w( (stbir__simdi)reg0, (stbir__simdi)reg1, (0<<0) + (1<<2) + (2<<4) + (3<<6) ), (3<<0) + (0<<2) + (1<<4) + (2<<6) ) ) + #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out)=(stbir__simdf)( __lsx_vshuf4i_w( __lsx_vpermi_w( (stbir__simdi)reg0, (stbir__simdi)reg1, (0<<0) + (1<<2) + (2<<4) + (3<<6) ), (2<<0) + (3<<2) + (0<<4) + (1<<6) ) ) + + static const stbir__simdf STBIR_zeroones = { 0.0f,1.0f,0.0f,1.0f }; + static const stbir__simdf STBIR_onezeros = { 1.0f,0.0f,1.0f,0.0f }; + #define stbir__simdf_aaa1( out, alp, ones ) (out) = (stbir__simdf)__lsx_vshuf4i_w( __lsx_vilvh_d( (stbir__simdi)ones, (stbir__simdi)alp ), (1<<0) + (1<<2) + (1<<4) + (2<<6) ) + #define stbir__simdf_1aaa( out, alp, ones ) (out) = (stbir__simdf)__lsx_vshuf4i_w( __lsx_vilvl_d( (stbir__simdi)alp, 
(stbir__simdi)ones ), (0<<0) + (2<<2) + (2<<4) + (2<<6) ) + #define stbir__simdf_a1a1( out, alp, ones) (out) = (stbir__simdf)__lsx_vor_v( ( __lsx_vslli_d( (stbir__simdi)(alp), 32 ) ), (__m128i)STBIR_zeroones ) + #define stbir__simdf_1a1a( out, alp, ones) (out) = (stbir__simdf)__lsx_vor_v( ( __lsx_vslli_d( (stbir__simdi)(alp), 32 ) ), (__m128i)STBIR_onezeros ) + + #define stbir__simdf_swiz( reg, one, two, three, four ) (stbir__simdf)( __lsx_vshuf4i_w( (stbir__simdi)( reg ), (one<<0) + (two<<2) + (three<<4) + (four<<6) ) ) + + #define stbir__simdi_and( out, reg0, reg1 ) (out) = __lsx_vand_v( reg0, reg1 ) + #define stbir__simdi_or( out, reg0, reg1 ) (out) = __lsx_vor_v( reg0, reg1 ) + #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = __lsx_vadd_w(__lsx_vmulwev_w_h(reg0, reg1), __lsx_vmulwod_w_h(reg0, reg1)); + + #define stbir__simdf_pack_to_8bytes(out,aa,bb) \ + { \ + stbir__simdf af,bf; \ + stbir__simdi a,b; \ + af = __lsx_vfmin_s( aa, STBIR_max_uint8_as_float ); \ + bf = __lsx_vfmin_s( bb, STBIR_max_uint8_as_float ); \ + af = __lsx_vfmax_s( af, (__m128)__lsx_vldi(0) ); \ + bf = __lsx_vfmax_s( bf, (__m128)__lsx_vldi(0) ); \ + a = __lsx_vftintrz_w_s( af ); \ + b = __lsx_vftintrz_w_s( bf ); \ + a = __lsx_vpickev_h( __lsx_vsat_w(b, 15), __lsx_vsat_w(a, 15) ); \ + out = __lsx_vpickev_b( __lsx_vsat_hu( __lsx_vmax_h(__lsx_vldi(0), a), 7), __lsx_vsat_hu( __lsx_vmax_h(__lsx_vldi(0), a),7) ); \ + } + + #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \ + stbir__simdf_load( o0, (ptr) ); \ + stbir__simdf_load( o1, (ptr)+4 ); \ + stbir__simdf_load( o2, (ptr)+8 ); \ + stbir__simdf_load( o3, (ptr)+12 ); \ + { \ + __m128 tmp0, tmp1, tmp2, tmp3; \ + tmp0 = (__m128)__lsx_vilvl_w((__m128i)o1, (__m128i)o0); \ + tmp2 = (__m128)__lsx_vilvl_w((__m128i)o3, (__m128i)o2); \ + tmp1 = (__m128)__lsx_vilvh_w((__m128i)o1, (__m128i)o0); \ + tmp3 = (__m128)__lsx_vilvh_w((__m128i)o3, (__m128i)o2); \ + o0 = (__m128)__lsx_vilvl_d((__m128i)tmp2, (__m128i)tmp0); \ + o1 = (__m128)__lsx_vilvh_d((__m128i)tmp2, (__m128i)tmp0); \ + o2 = (__m128)__lsx_vilvl_d((__m128i)tmp3, (__m128i)tmp1); \ + o3 = (__m128)__lsx_vilvh_d((__m128i)tmp3, (__m128i)tmp1); \ + } + + #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \ + r0 = __lsx_vpickev_h( __lsx_vsat_w(r1, 15), __lsx_vsat_w(r0, 15) ); \ + r2 = __lsx_vpickev_h( __lsx_vsat_w(r3, 15), __lsx_vsat_w(r2, 15) ); \ + r1 = __lsx_vilvl_h( r2, r0 ); \ + r3 = __lsx_vilvh_h( r2, r0 ); \ + r0 = __lsx_vilvl_h( r3, r1 ); \ + r2 = __lsx_vilvh_h( r3, r1 ); \ + r0 = __lsx_vpickev_b( __lsx_vsat_hu( __lsx_vmax_h(__lsx_vldi(0), r2), 7), __lsx_vsat_hu( __lsx_vmax_h(__lsx_vldi(0), r0),7) ); \ + stbir__simdi_store( ptr, r0 ); \ + + #define stbir__simdi_32shr( out, reg, imm ) out = __lsx_vsrli_w( (__m128i)reg, imm ) + + #define STBIR__CONST_4_32i( v ) (long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))),(long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))) + #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) (long long)((((stbir_uint64)(stbir_uint32)(v1))<<32)|((stbir_uint64)(stbir_uint32)(v0))),(long long)((((stbir_uint64)(stbir_uint32)(v3))<<32)|((stbir_uint64)(stbir_uint32)(v2))) + + #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x } + #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { STBIR__CONST_4_32i(x) } + #define STBIR__CONSTF(var) (var) + #define STBIR__CONSTI(var) (var) + + STBIR__SIMDI_CONST(stbir__s32_32768, 32768); + STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768)); + + 
#define stbir__simdf_pack_to_8words(out,reg0,reg1) \ + { \ + stbir__simdi tmp0,tmp1; \ + tmp0 = __lsx_vftintrz_w_s(__lsx_vfmax_s(__lsx_vfmin_s(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),(stbir__simdf)__lsx_vldi(0))); \ + tmp1 = __lsx_vftintrz_w_s(__lsx_vfmax_s(__lsx_vfmin_s(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),(stbir__simdf)__lsx_vldi(0))); \ + tmp0 = __lsx_vsub_w( tmp0, stbir__s32_32768 ); \ + tmp1 = __lsx_vsub_w( tmp1, stbir__s32_32768 ); \ + out = __lsx_vpickev_h( __lsx_vsat_w(tmp1, 15), __lsx_vsat_w(tmp0, 15) ); \ + out = __lsx_vsub_h( out, stbir__s16_32768 ); \ + } + + #define STBIR_SIMD + + #ifdef STBIR_FLOORF + #undef STBIR_FLOORF + #endif + #define STBIR_FLOORF stbir_simd_floorf + static stbir__inline float stbir_simd_floorf(float x) // martins floorf + { + FloatInt fi; + __m128 f = __lsx_vreplfr2vr_s(x); + __m128 t = __lsx_vfrintrz_s(f); + __m128 r = __lsx_vfadd_s(t, (__m128)__lsx_vand_v((__m128i)__lsx_vfcmp_clt_s(f, t), (__m128i)__lsx_vreplfr2vr_s(-1.0f))); + fi.i = __lsx_vpickve2gr_w((__m128i)r, 0); + return fi.f; + } + + #ifdef STBIR_CEILF + #undef STBIR_CEILF + #endif + #define STBIR_CEILF stbir_simd_ceilf + static stbir__inline float stbir_simd_ceilf(float x) // martins ceilf + { + FloatInt fi; + __m128 f = __lsx_vreplfr2vr_s(x); + __m128 t = __lsx_vfrintrz_s(f); + __m128 r = __lsx_vfadd_s(t, (__m128)__lsx_vand_v((__m128i)__lsx_vfcmp_clt_s(t, f), (__m128i)__lsx_vreplfr2vr_s(1.0f))); + fi.i = __lsx_vpickve2gr_w((__m128i)r, 0); + return fi.f; + } #endif // SSE2/NEON/WASM #endif // NO SIMD @@ -2457,7 +2665,142 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) { return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0); } +#elif defined(STBIR_LSX) + // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668 + stbir__inline static void stbir__half_to_float_SIMD(float * output, void const * input) + { + static const STBIR__SIMDI_CONST(mask_nosign, 0x7fff); + static const STBIR__SIMDI_CONST(smallest_normal, 0x0400); + static const STBIR__SIMDI_CONST(infinity, 0x7c00); + static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23); + static const STBIR__SIMDI_CONST(magic_denorm, 113 << 23); + + __m128i i = __lsx_vld ( (__m128i const*)(input), 0 ); + __m128i h = __lsx_vilvl_h ( __lsx_vldi(0), i ); + __m128i mnosign = STBIR__CONSTI(mask_nosign); + __m128i eadjust = STBIR__CONSTI(expadjust_normal); + __m128i smallest = STBIR__CONSTI(smallest_normal); + __m128i infty = STBIR__CONSTI(infinity); + __m128i expmant = __lsx_vand_v(mnosign, h); + __m128i justsign = __lsx_vxor_v(h, expmant); + __m128i b_notinfnan = __lsx_vsle_w( expmant, infty ); + __m128i b_isdenorm = __lsx_vsle_w( expmant, smallest ); + __m128i shifted = __lsx_vslli_w(expmant, 13); + __m128i adj_infnan = __lsx_vandn_v(b_notinfnan, eadjust); + __m128i adjusted = __lsx_vadd_w(eadjust, shifted); + __m128i den1 = __lsx_vadd_w(shifted, STBIR__CONSTI(magic_denorm)); + __m128i adjusted2 = __lsx_vadd_w(adjusted, adj_infnan); + __m128 den2 = __lsx_vfsub_s((__m128)(den1), *(const __m128 *)&magic_denorm); + __m128 adjusted3 = (__m128)__lsx_vand_v((__m128i)den2, (__m128i)(b_isdenorm)); + __m128 adjusted4 = (__m128)__lsx_vandn_v((__m128i)b_isdenorm, (__m128i)adjusted2); + __m128 adjusted5 = (__m128)__lsx_vor_v((__m128i)adjusted3, (__m128i)adjusted4); + __m128i sign = __lsx_vslli_w(justsign, 16); + __m128 final = (__m128)__lsx_vor_v((__m128i)adjusted5, sign); + stbir__simdf_store( output + 0, final ); + + h = __lsx_vilvh_h ( __lsx_vldi(0), i ); + expmant = __lsx_vand_v(mnosign, 
h); + justsign = __lsx_vxor_v(h, expmant); + b_notinfnan = __lsx_vsle_w( expmant, infty ); + b_isdenorm = __lsx_vsle_w( expmant, smallest ); + shifted = __lsx_vslli_w(expmant, 13); + adj_infnan = __lsx_vandn_v(b_notinfnan, eadjust); + adjusted = __lsx_vadd_w(eadjust, shifted); + den1 = __lsx_vadd_w(shifted, STBIR__CONSTI(magic_denorm)); + adjusted2 = __lsx_vadd_w(adjusted, adj_infnan); + den2 = __lsx_vfsub_s((__m128)(den1), *(const __m128 *)&magic_denorm); + adjusted3 = (__m128)__lsx_vand_v((__m128i)den2, (__m128i)(b_isdenorm)); + adjusted4 = (__m128)__lsx_vandn_v((__m128i)b_isdenorm, (__m128i)adjusted2); + adjusted5 = (__m128)__lsx_vor_v((__m128i)adjusted3, (__m128i)adjusted4); + sign = __lsx_vslli_w(justsign, 16); + final = (__m128)__lsx_vor_v((__m128i)adjusted5, (__m128i)sign); + stbir__simdf_store( output + 4, final ); + + // ~38 SSE2 ops for 8 values + } + + // Fabian's round-to-nearest-even float to half + // ~48 SSE2 ops for 8 output + stbir__inline static void stbir__float_to_half_SIMD(void * output, float const * input) + { + static const STBIR__SIMDI_CONST(mask_sign, 0x80000000u); + static const STBIR__SIMDI_CONST(c_f16max, (127 + 16) << 23); // all FP32 values >=this round to +inf + static const STBIR__SIMDI_CONST(c_nanbit, 0x200); + static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00); + static const STBIR__SIMDI_CONST(c_min_normal, (127 - 14) << 23); // smallest FP32 that yields a normalized FP16 + static const STBIR__SIMDI_CONST(c_subnorm_magic, ((127 - 15) + (23 - 10) + 1) << 23); + static const STBIR__SIMDI_CONST(c_normal_bias, 0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding + + __m128 f = (__m128)__lsx_vld(input, 0); + __m128 msign = (__m128)(STBIR__CONSTI(mask_sign)); + __m128 justsign = (__m128)__lsx_vand_v((__m128i)msign, (__m128i)f); + __m128 absf = (__m128)__lsx_vxor_v((__m128i)f, (__m128i)justsign); + __m128i absf_int = (__m128i)(absf); // the cast is "free" (extra bypass latency, but no thruput hit) + __m128i f16max = STBIR__CONSTI(c_f16max); + __m128 b_isnan = (__m128)__lsx_vfcmp_cune_s(absf, absf); // is this a NaN? + __m128i b_isregular = __lsx_vsle_w(absf_int, f16max); // (sub)normalized or special? 
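+    // specials: out-of-range values and NaNs map to the FP16 infinity pattern (0x7c00),
+    // with the quiet bit (0x200) OR'd in for NaN inputs so NaN stays NaN after packing.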
+ __m128i nanbit = __lsx_vand_v((__m128i)(b_isnan), STBIR__CONSTI(c_nanbit)); + __m128i inf_or_nan = __lsx_vor_v(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials + + __m128i min_normal = STBIR__CONSTI(c_min_normal); + __m128i b_issub = __lsx_vsle_w(absf_int, min_normal); + + // "result is subnormal" path + __m128 subnorm1 = __lsx_vfadd_s(absf, (__m128)(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa + __m128i subnorm2 = __lsx_vsub_w((__m128i)(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias + + // "result is normal" path + __m128i mantoddbit = __lsx_vslli_w(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign + __m128i mantodd = __lsx_vsrai_w(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0 + + __m128i round1 = __lsx_vadd_w(absf_int, STBIR__CONSTI(c_normal_bias)); + __m128i round2 = __lsx_vsub_w(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE) + __m128i normal = __lsx_vsrli_w(round2, 13); // rounded result + + // combine the two non-specials + __m128i nonspecial = __lsx_vor_v(__lsx_vand_v(subnorm2, b_issub), __lsx_vandn_v(b_issub, normal)); + + // merge in specials as well + __m128i joined = __lsx_vor_v(__lsx_vand_v(nonspecial, b_isregular), __lsx_vandn_v(b_isregular, inf_or_nan)); + + __m128i sign_shift = __lsx_vsrai_w((__m128i)(justsign), 16); + __m128i final2, final= __lsx_vor_v(joined, sign_shift); + + f = (__m128)__lsx_vld(input+4, 0); + justsign = (__m128)__lsx_vand_v((__m128i)msign, (__m128i)f); + absf = (__m128)__lsx_vxor_v((__m128i)f, (__m128i)justsign); + absf_int = (__m128i)(absf); // the cast is "free" (extra bypass latency, but no thruput hit) + b_isnan = (__m128)__lsx_vfcmp_cune_s(absf, absf); // is this a NaN? + b_isregular = __lsx_vsle_w (absf_int, f16max); // (sub)normalized or special? 
+ nanbit = __lsx_vand_v((__m128i)(b_isnan), c_nanbit); + inf_or_nan = __lsx_vor_v(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials + + b_issub = __lsx_vsle_w(absf_int, min_normal); + + // "result is subnormal" path + subnorm1 = __lsx_vfadd_s(absf, (__m128)(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa + subnorm2 = __lsx_vsub_w((__m128i)(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias + + // "result is normal" path + mantoddbit = __lsx_vslli_w(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign + mantodd = __lsx_vsrai_w(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0 + + round1 = __lsx_vadd_w(absf_int, STBIR__CONSTI(c_normal_bias)); + round2 = __lsx_vsub_w(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE) + normal = __lsx_vsrli_w(round2, 13); // rounded result + + // combine the two non-specials + nonspecial = __lsx_vor_v(__lsx_vand_v(subnorm2, b_issub), __lsx_vandn_v(b_issub, normal)); + + // merge in specials as well + joined = __lsx_vor_v(__lsx_vand_v(nonspecial, b_isregular), __lsx_vandn_v(b_isregular, inf_or_nan)); + + sign_shift = __lsx_vsrai_w((__m128i)(justsign), 16); + final2 = __lsx_vor_v(joined, sign_shift); + final = __lsx_vpickev_h(__lsx_vsat_w(final2, 15), __lsx_vsat_w(final, 15)); + stbir__simdi_store( output,final ); + } #endif @@ -2648,7 +2991,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte #ifdef STBIR_PROFILE -#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ ) +#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ ) || defined( __loongarch__ ) #ifdef _MSC_VER diff --git a/tests/image_test.c b/tests/image_test.c index 9c216cf96..7238301b5 100644 --- a/tests/image_test.c +++ b/tests/image_test.c @@ -3,6 +3,7 @@ #define STB_IMAGE_WRITE_IMPLEMENTATION #include "stb_image_write.h" +#define STBI_LSX 1 #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h"