From d0cd52370633224a37b8b6b298ce9e8a8ff4790c Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 8 Dec 2023 13:50:13 +0800 Subject: [PATCH 01/10] riscv int8 convolution Signed-off-by: Molly Sophia --- .../riscv/convolution_3x3_winograd_int8.h | 1620 +++++++++++++++++ .../riscv/convolution_im2col_gemm_int8.h | 953 ++++++++++ src/layer/riscv/convolution_packed_int8.h | 398 ++++ src/layer/riscv/convolution_riscv.cpp | 145 +- src/layer/riscv/convolution_riscv.h | 8 + 5 files changed, 3121 insertions(+), 3 deletions(-) create mode 100644 src/layer/riscv/convolution_3x3_winograd_int8.h create mode 100644 src/layer/riscv/convolution_im2col_gemm_int8.h create mode 100644 src/layer/riscv/convolution_packed_int8.h diff --git a/src/layer/riscv/convolution_3x3_winograd_int8.h b/src/layer/riscv/convolution_3x3_winograd_int8.h new file mode 100644 index 000000000000..9dda146b1338 --- /dev/null +++ b/src/layer/riscv/convolution_3x3_winograd_int8.h @@ -0,0 +1,1620 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void pack_A_tile_int8(const Mat& A, Mat& AT, int batch, int max_ii, int max_kk) +{ + const int N = max_kk * batch; + + for (int b = 0; b < batch; b++) + { + short* pp = AT.row(b); + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + const short* p0 = (const short*)A + ii * N + b; + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[batch]; + pp[2] = p0[N]; + pp[3] = p0[N + batch]; + p0 += batch * 2; + pp += 4; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p0[N]; + p0 += batch; + pp += 2; + } + } + for (; ii < max_ii; ii++) + { + const short* p0 = (const short*)A + ii * N + b; + + int kk = 0; + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + p0 += batch; + pp += 1; + } + } + } +} + +static void transpose_pack_B_tile_int8(const Mat& B, Mat& BT, int batch, int max_jj, int max_kk, int nT) +{ + #pragma omp parallel for num_threads(nT) + for (int b = 0; b < batch; b++) + { + short* pp = BT.row(b); + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + const short* p0 = B; + + int kk = 0; + p0 += (b * max_jj + jj) * 2; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[1]; + pp[2] = p0[2]; + pp[3] = p0[3]; + p0 += max_jj * batch * 2; + pp += 4; + } + p0 -= (b * max_jj + jj) * 2; + p0 += (b * max_jj + jj); + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p0[1]; + p0 += max_jj * batch; + pp += 2; + } + } + for (; jj < max_jj; jj++) + { + const short* p0 = B; + + int kk = 0; + p0 += (b * max_jj + jj) * 2; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[1]; + p0 += max_jj * batch * 2; + pp += 2; + } + p0 -= (b * max_jj + jj) * 2; + p0 += (b * max_jj + jj); + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + p0 += max_jj * batch; + pp += 1; + } + } + } +} + +static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& 
BT_tile, Mat& top_blob, int batch, int max_ii, int max_jj, int k, int max_kk, bool k_end) +{ + int* outptr = top_blob; + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + for (int b = 0; b < batch; b++) + { + const short* pAT = AT_tile.row(b) + max_kk * ii; + const short* pB = BT_tile.row(b); + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + const short* pA = pAT; + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + if (k == 0) + { + sum00 = 0; + sum01 = 0; + sum10 = 0; + sum11 = 0; + } + else + { + sum00 = outptr[0]; + sum01 = outptr[1]; + sum10 = outptr[2]; + sum11 = outptr[3]; + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum00 += pA[0] * pB[0]; + sum00 += pA[1] * pB[1]; + sum01 += pA[2] * pB[0]; + sum01 += pA[3] * pB[1]; + sum10 += pA[0] * pB[2]; + sum10 += pA[1] * pB[3]; + sum11 += pA[2] * pB[2]; + sum11 += pA[3] * pB[3]; + + pA += 4; + pB += 4; + } + for (; kk < max_kk; kk++) + { + sum00 += pA[0] * pB[0]; + sum01 += pA[1] * pB[0]; + sum10 += pA[0] * pB[1]; + sum11 += pA[1] * pB[1]; + pA += 2; + pB += 2; + } + + outptr[0] = sum00; + outptr[1] = sum01; + outptr[2] = sum10; + outptr[3] = sum11; + outptr += 2 * 2; + } + for (; jj < max_jj; jj++) + { + const short* pA = pAT; + + int sum0 = 0; + int sum1 = 0; + + if (k == 0) + { + sum0 = 0; + sum1 = 0; + } + else + { + sum0 = outptr[0]; + sum1 = outptr[1]; + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum0 += pA[0] * pB[0]; + sum0 += pA[1] * pB[1]; + sum1 += pA[2] * pB[0]; + sum1 += pA[3] * pB[1]; + pA += 4; + pB += 2; + } + for (; kk < max_kk; kk++) + { + sum0 += pA[0] * pB[0]; + sum1 += pA[1] * pB[0]; + pA += 2; + pB += 1; + } + + outptr[0] = sum0; + outptr[1] = sum1; + outptr += 2; + } + } + } + for (; ii < max_ii; ii++) + { + for (int b = 0; b < batch; b++) + { + const short* pAT = AT_tile.row(b) + max_kk * ii; + const short* pB = BT_tile.row(b); + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + const short* pA = pAT; + + int sum0 = 0; + int sum1 = 0; + + if (k == 0) + { + sum0 = 0; + sum1 = 0; + } + else + { + sum0 = outptr[0]; + sum1 = outptr[1]; + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum0 += pA[0] * pB[0]; + sum0 += pA[1] * pB[1]; + sum1 += pA[0] * pB[2]; + sum1 += pA[1] * pB[3]; + pA += 2; + pB += 4; + } + for (; kk < max_kk; kk++) + { + sum0 += pA[0] * pB[0]; + sum1 += pA[0] * pB[1]; + pA += 1; + pB += 2; + } + + outptr[0] = sum0; + outptr[1] = sum1; + outptr += 2; + } + for (; jj < max_jj; jj++) + { + const short* pA = pAT; + + int sum = 0; + + if (k == 0) + { + sum = 0; + } + else + { + sum = outptr[0]; + } + + int kk = 0; + for (; kk < max_kk; kk++) + { + sum += pA[0] * pB[0]; + pA += 1; + pB += 1; + } + + outptr[0] = sum; + outptr += 1; + } + } + } +} + +static void get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) +{ + // resolve optimal tile size from cache size + const size_t l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(short)); + + if (nT == 0) + nT = get_physical_big_cpu_count(); + + // solve M + { + int tile_size = (int)sqrt((float)l2_cache_size_int8 / 3); + + TILE_M = std::max(2, tile_size / 2 * 2); + + TILE_M *= std::min(nT, get_physical_cpu_count()); + + int nn_M = (M + TILE_M - 1) / TILE_M; + + TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 1) / 2 * 2); + + if (nT > 1) + { + TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 1) / 2 * 2); + } + } + + // solve K + { + int tile_size = (int)(sqrt((float)l2_cache_size_int8) - TILE_M); + + TILE_K = std::max(2, 
tile_size / 2 * 2); + + int nn_K = (K + TILE_K - 1) / TILE_K; + + TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2); + } + + if (N > 0) + { + int tile_size = (int)((l2_cache_size_int8 - TILE_M * TILE_K) / (TILE_M * 2 + TILE_K)); + + TILE_N = std::max(1, tile_size); + + int nn_N = (N + TILE_N - 1) / TILE_N; + + TILE_N = std::min(TILE_N, (N + nn_N - 1) / nn_N); + } +} + +static inline void conv3x3s1_winograd23_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) +{ + // const signed char ktm[4][3] = { + // {2, 0, 0}, + // {1, 1, 1}, + // {1, -1, 1}, + // {0, 0, 2} + // }; + + short* ptmp = A; + + int ii = 0; + for (; ii < max_ii; ii++) + { + int kk = 0; + for (; kk < max_kk; kk++) + { + short tmp[4][3]; + + const signed char* k0 = (const signed char*)kernel + (i + ii) * inch * 9 + (k + kk) * 9; + + for (int m = 0; m < 3; m++) + { + signed char r0 = k0[0]; + signed char r1 = k0[1]; + signed char r2 = k0[2]; + + tmp[0][m] = r0 * 2; + tmp[1][m] = r0 + r1 + r2; + tmp[2][m] = r0 - r1 + r2; + tmp[3][m] = r2 * 2; + + k0 += 3; + } + + for (int m = 0; m < 4; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + + short z0 = r0 * 2; + short z1 = r0 + r1 + r2; + short z2 = r0 - r1 + r2; + short z3 = r2 * 2; + + ptmp[0] = z0; + ptmp[1] = z1; + ptmp[2] = z2; + ptmp[3] = z3; + ptmp += 4; + } + } + } +} + +static void conv3x3s1_winograd23_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt) +{ + const int M = outch; + const int K = inch; + const int B = 16; + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + + Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, 2u, (Allocator*)0); + + AT.create(TILE_K * TILE_M, B, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 2u, (Allocator*)0); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat A_tile = A_tileX.channel(get_omp_thread_num()); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_ii = std::min((M - i), TILE_M); + const int max_kk = std::min((K - k), TILE_K); + + conv3x3s1_winograd23_transform_kernel_tile_int8(kernel, A_tile, inch, i, max_ii, k, max_kk); + + Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + pack_A_tile_int8(A_tile, AT_tile, B, max_ii, max_kk); + } + } +} + +static inline void conv3x3s1_winograd23_transform_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT) +{ + // const signed char itm[4][4] = { + // {1, 0, -1, 0}, + // {0, 1, 1, 0}, + // {0, -1, 1, 0}, + // {0, -1, 0, 1} + // }; + + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int elempack = bottom_blob.elempack; + const int N = bottom_blob.cstep * elempack; + + const int w_tiles = (w - 1) / 2; + + int nn_max_kk = 0; + int remain_max_kk_start = 0; + nn_max_kk = (max_kk - remain_max_kk_start) / 2; + #pragma omp parallel for num_threads(nT) + for (int ppkk = 0; ppkk < nn_max_kk; ppkk++) + { + const int kk = remain_max_kk_start + ppkk * 2; + + short tmp[4][4][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0 = bottom_blob.channel(k + kk).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 4; m++) + { + signed char r00 = 0; + signed char r01 = 0; + signed char r10 = 0; + signed char r11 = 0; + signed 
char r20 = 0; + signed char r21 = 0; + signed char r30 = 0; + signed char r31 = 0; + + if (ti * 2 + m < h) + { + // if (elempack == 1) + { + const signed char* r1 = r0 + N; + + r00 = r0[0]; + r01 = r1[0]; + if (tj * 2 + 1 < w) + { + r10 = r0[1]; + r11 = r1[1]; + } + if (tj * 2 + 2 < w) + { + r20 = r0[2]; + r21 = r1[2]; + } + if (tj * 2 + 3 < w) + { + r30 = r0[3]; + r31 = r1[3]; + } + } + } + + tmp[0][m][0] = r00 - r20; + tmp[0][m][1] = r01 - r21; + tmp[1][m][0] = r10 + r20; + tmp[1][m][1] = r11 + r21; + tmp[2][m][0] = r20 - r10; + tmp[2][m][1] = r21 - r11; + tmp[3][m][0] = r30 - r10; + tmp[3][m][1] = r31 - r11; + + r0 += w; + } + + short* p0 = (short*)B + kk * max_jj * 16 + jj * 2; + short* p1 = p0 + max_jj * 2; + short* p2 = p0 + max_jj * 2 * 2; + short* p3 = p0 + max_jj * 2 * 3; + + for (int m = 0; m < 4; m++) + { + short r00 = tmp[m][0][0]; + short r01 = tmp[m][0][1]; + short r10 = tmp[m][1][0]; + short r11 = tmp[m][1][1]; + short r20 = tmp[m][2][0]; + short r21 = tmp[m][2][1]; + short r30 = tmp[m][3][0]; + short r31 = tmp[m][3][1]; + + p0[0] = r00 - r20; + p0[1] = r01 - r21; + p1[0] = r10 + r20; + p1[1] = r11 + r21; + p2[0] = r20 - r10; + p2[1] = r21 - r11; + p3[0] = r30 - r10; + p3[1] = r31 - r11; + + p0 += max_jj * 4 * 2; + p1 += max_jj * 4 * 2; + p2 += max_jj * 4 * 2; + p3 += max_jj * 4 * 2; + } + } + } + remain_max_kk_start += nn_max_kk * 2; + for (int kk = remain_max_kk_start; kk < max_kk; kk++) + { + short tmp[4][4]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0123 = bottom_blob.channel(k + kk).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 4; m++) + { + signed char r0 = 0; + signed char r1 = 0; + signed char r2 = 0; + signed char r3 = 0; + + if (ti * 2 + m < h) + { + // if (elempack == 1) + { + r0 = r0123[0]; + if (tj * 2 + 1 < w) r1 = r0123[1]; + if (tj * 2 + 2 < w) r2 = r0123[2]; + if (tj * 2 + 3 < w) r3 = r0123[3]; + } + } + + tmp[0][m] = r0 - r2; + tmp[1][m] = r1 + r2; + tmp[2][m] = r2 - r1; + tmp[3][m] = r3 - r1; + + r0123 += w; + } + + short* p0 = (short*)B + kk * max_jj * 16 + jj; + short* p1 = p0 + max_jj; + short* p2 = p0 + max_jj * 2; + short* p3 = p0 + max_jj * 3; + + for (int m = 0; m < 4; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + short r3 = tmp[m][3]; + + p0[0] = r0 - r2; + p1[0] = r1 + r2; + p2[0] = r2 - r1; + p3[0] = r3 - r1; + + p0 += max_jj * 4; + p1 += max_jj * 4; + p2 += max_jj * 4; + p3 += max_jj * 4; + } + } + } +} + +static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& top_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj) +{ + // const int otm[2][4] = { + // {1, 1, 1, 0}, + // {0, 1, -1, 1} + // }; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int N = top_blob.cstep * out_elempack; + + const int w_tiles = (outw + 1) / 2; + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + int tmp[2][4][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj * 2; + const int* r1 = r0 + max_jj * 2; + const int* r2 = r0 + max_jj * 2 * 2; + const int* r3 = r0 + max_jj * 2 * 3; + + for (int m = 0; m < 4; m++) + { + tmp[0][m][0] = r0[0] + r1[0] + r2[0]; + tmp[0][m][1] = r0[1] + r1[1] + r2[1]; + tmp[1][m][0] = r1[0] - r2[0] + r3[0]; + tmp[1][m][1] = r1[1] - r2[1] + r3[1]; + + r0 += max_jj * 4 * 2; + r1 += max_jj * 4 * 2; 
+ r2 += max_jj * 4 * 2; + r3 += max_jj * 4 * 2; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 2; m++) + { + if (ti * 2 + m >= outh) + continue; + + int tmp00 = tmp[m][0][0] + tmp[m][1][0] + tmp[m][2][0]; + int tmp01 = tmp[m][0][1] + tmp[m][1][1] + tmp[m][2][1]; + int tmp10 = tmp[m][1][0] - tmp[m][2][0] + tmp[m][3][0]; + int tmp11 = tmp[m][1][1] - tmp[m][2][1] + tmp[m][3][1]; + + tmp00 = tmp00 >> 2; + tmp01 = tmp01 >> 2; + tmp10 = tmp10 >> 2; + tmp11 = tmp11 >> 2; + + // if (out_elempack == 1) + { + int* outptr1 = outptr0 + N; + + outptr0[0] = tmp00; + outptr1[0] = tmp01; + if (tj * 2 + 1 < outw) + { + outptr0[1] = tmp10; + outptr1[1] = tmp11; + } + } + + outptr0 += outw; + } + } + } + for (; ii < max_ii; ii++) + { + int tmp[2][4]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj; + const int* r1 = r0 + max_jj; + const int* r2 = r0 + max_jj * 2; + const int* r3 = r0 + max_jj * 3; + + for (int m = 0; m < 4; m++) + { + tmp[0][m] = r0[0] + r1[0] + r2[0]; + tmp[1][m] = r1[0] - r2[0] + r3[0]; + + r0 += max_jj * 4; + r1 += max_jj * 4; + r2 += max_jj * 4; + r3 += max_jj * 4; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 2; m++) + { + if (ti * 2 + m >= outh) + continue; + + int tmp0 = tmp[m][0] + tmp[m][1] + tmp[m][2]; + int tmp1 = tmp[m][1] - tmp[m][2] + tmp[m][3]; + + tmp0 = tmp0 >> 2; + tmp1 = tmp1 >> 2; + + // if (out_elempack == 1) + { + outptr0[0] = tmp0; + if (tj * 2 + 1 < outw) outptr0[1] = tmp1; + } + + outptr0 += outw; + } + } + } +} + +static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) +{ + int outw = top_blob.w; + int outh = top_blob.h; + + // pad to 2n+2, winograd F(2,3) + int w_tiles = (outw + 1) / 2; + int h_tiles = (outh + 1) / 2; + int tiles = w_tiles * h_tiles; + + const int M = top_blob.c * top_blob.elempack; + const int N = tiles; + const int K = bottom_blob.c * bottom_blob.elempack; + const int B = 16; + + // NCNN_LOGE("conv3x3s1_winograd23_int8 %d %d %d", M, N, K); + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + const int nn_N = (N + TILE_N - 1) / TILE_N; + const int nn_K = (K + TILE_K - 1) / TILE_K; + + // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); + + Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + + const int nn_NK = nn_N * nn_K; + + if (nT > 1 && nn_NK < nT) + { + Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); + + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + // transform input + conv3x3s1_winograd23_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, nT); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, nT); + } + } + else + { + Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + 
const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat B_tile = B_tileX.channel(get_omp_thread_num()); + + // transform input + conv3x3s1_winograd23_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, 1); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, 1); + } + } + + Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat top_tile = top_tileX.channel(get_omp_thread_num()); + + const int max_ii = std::min((M - i), TILE_M); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + bool k_end = k + TILE_K >= K; + + gemm_transB_packed_tile_int8(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, k_end); + } + + // transform output + conv3x3s1_winograd23_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj); + } + } +} + +static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) +{ + // const short ktm[6][3] = { + // {6, 0, 0}, + // {-4, -4, -4}, + // {-4, 4, -4}, + // {1, 2, 4}, + // {1, -2, 4}, + // {0, 0, 6} + // }; + + short* ptmp = A; + + int ii = 0; + for (; ii < max_ii; ii++) + { + int kk = 0; + for (; kk < max_kk; kk++) + { + short tmp[6][3]; + + const signed char* k0 = (const signed char*)kernel + (i + ii) * inch * 9 + (k + kk) * 9; + + for (int m = 0; m < 3; m++) + { + signed char r0 = k0[0]; + signed char r1 = k0[1]; + signed char r2 = k0[2]; + + tmp[0][m] = r0 * 6; + tmp[1][m] = -r0 * 4 - r1 * 4 - r2 * 4; + tmp[2][m] = -r0 * 4 + r1 * 4 - r2 * 4; + tmp[3][m] = r0 + r1 * 2 + r2 * 4; + tmp[4][m] = r0 - r1 * 2 + r2 * 4; + tmp[5][m] = r2 * 6; + + k0 += 3; + } + + for (int m = 0; m < 6; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + + short z0 = r0 * 6; + short z1 = -r0 * 4 - r1 * 4 - r2 * 4; + short z2 = -r0 * 4 + r1 * 4 - r2 * 4; + short z3 = r0 + r1 * 2 + r2 * 4; + short z4 = r0 - r1 * 2 + r2 * 4; + short z5 = r2 * 6; + + ptmp[0] = z0; + ptmp[1] = z1; + ptmp[2] = z2; + ptmp[3] = z3; + ptmp[4] = z4; + ptmp[5] = z5; + ptmp += 6; + } + } + } +} + +static void conv3x3s1_winograd43_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt) +{ + const int M = outch; + const int K = inch; + const int B = 36; + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + + Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, 4u, (Allocator*)0); + + AT.create(TILE_K * TILE_M, B, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 4u, (Allocator*)0); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat A_tile = A_tileX.channel(get_omp_thread_num()); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_ii = std::min((M - i), TILE_M); + const int max_kk = std::min((K - k), TILE_K); + + conv3x3s1_winograd43_transform_kernel_tile_int8(kernel, 
A_tile, inch, i, max_ii, k, max_kk); + + Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + pack_A_tile_int8(A_tile, AT_tile, B, max_ii, max_kk); + } + } +} + +static inline void conv3x3s1_winograd43_transform_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT) +{ + // const float itm[4][4] = { + // {4, 0, -5, 0, 1, 0}, + // {0, -4, -4, 1, 1, 0}, + // {0, 4, -4, -1, 1, 0}, + // {0, -2, -1, 2, 1, 0}, + // {0, 2, -1, -2, 1, 0}, + // {0, 4, 0, -5, 0, 1} + // }; + + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int elempack = bottom_blob.elempack; + const int N = bottom_blob.cstep * elempack; + + const int w_tiles = (w + 1) / 4; + + int nn_max_kk = 0; + int remain_max_kk_start = 0; + nn_max_kk = (max_kk - remain_max_kk_start) / 2; + #pragma omp parallel for num_threads(nT) + for (int ppkk = 0; ppkk < nn_max_kk; ppkk++) + { + const int kk = remain_max_kk_start + ppkk * 2; + + short tmp[6][6][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0 = bottom_blob.channel(k + kk).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 6; m++) + { + signed char r00 = 0; + signed char r01 = 0; + signed char r10 = 0; + signed char r11 = 0; + signed char r20 = 0; + signed char r21 = 0; + signed char r30 = 0; + signed char r31 = 0; + signed char r40 = 0; + signed char r41 = 0; + signed char r50 = 0; + signed char r51 = 0; + + if (ti * 4 + m < h) + { + // if (elempack == 1) + { + const signed char* r1 = r0 + N; + + r00 = r0[0]; + r01 = r1[0]; + if (tj * 4 + 1 < w) + { + r10 = r0[1]; + r11 = r1[1]; + } + if (tj * 4 + 2 < w) + { + r20 = r0[2]; + r21 = r1[2]; + } + if (tj * 4 + 3 < w) + { + r30 = r0[3]; + r31 = r1[3]; + } + if (tj * 4 + 4 < w) + { + r40 = r0[4]; + r41 = r1[4]; + } + if (tj * 4 + 5 < w) + { + r50 = r0[5]; + r51 = r1[5]; + } + } + } + + short tmp120a = r30 - r10 * 4; + short tmp121a = r31 - r11 * 4; + short tmp120b = r40 - r20 * 4; + short tmp121b = r41 - r21 * 4; + short tmp340a = (r30 - r10) * 2; + short tmp341a = (r31 - r11) * 2; + short tmp340b = r40 - r20; + short tmp341b = r41 - r21; + + tmp[0][m][0] = r40 + r00 * 4 - r20 * 5; + tmp[0][m][1] = r41 + r01 * 4 - r21 * 5; + tmp[1][m][0] = tmp120b + tmp120a; + tmp[1][m][1] = tmp121b + tmp121a; + tmp[2][m][0] = tmp120b - tmp120a; + tmp[2][m][1] = tmp121b - tmp121a; + tmp[3][m][0] = tmp340b + tmp340a; + tmp[3][m][1] = tmp341b + tmp341a; + tmp[4][m][0] = tmp340b - tmp340a; + tmp[4][m][1] = tmp341b - tmp341a; + tmp[5][m][0] = r50 + r10 * 4 - r30 * 5; + tmp[5][m][1] = r51 + r11 * 4 - r31 * 5; + + r0 += w; + } + + short* p0 = (short*)B + kk * max_jj * 36 + jj * 2; + short* p1 = p0 + max_jj * 2; + short* p2 = p0 + max_jj * 2 * 2; + short* p3 = p0 + max_jj * 2 * 3; + short* p4 = p0 + max_jj * 2 * 4; + short* p5 = p0 + max_jj * 2 * 5; + + for (int m = 0; m < 6; m++) + { + short r00 = tmp[m][0][0]; + short r01 = tmp[m][0][1]; + short r10 = tmp[m][1][0]; + short r11 = tmp[m][1][1]; + short r20 = tmp[m][2][0]; + short r21 = tmp[m][2][1]; + short r30 = tmp[m][3][0]; + short r31 = tmp[m][3][1]; + short r40 = tmp[m][4][0]; + short r41 = tmp[m][4][1]; + short r50 = tmp[m][5][0]; + short r51 = tmp[m][5][1]; + + short tmp120a = r30 - r10 * 4; + short tmp121a = r31 - r11 * 4; + short tmp120b = r40 - r20 * 4; + short tmp121b = r41 - r21 * 4; + short tmp340a = (r30 - r10) * 2; + short tmp341a = (r31 - r11) * 2; + short tmp340b = r40 - r20; + short tmp341b = r41 - r21; + + p0[0] = r40 + r00 * 4 - r20 * 5; + p0[1] = 
r41 + r01 * 4 - r21 * 5; + p1[0] = tmp120b + tmp120a; + p1[1] = tmp121b + tmp121a; + p2[0] = tmp120b - tmp120a; + p2[1] = tmp121b - tmp121a; + p3[0] = tmp340b + tmp340a; + p3[1] = tmp341b + tmp341a; + p4[0] = tmp340b - tmp340a; + p4[1] = tmp341b - tmp341a; + p5[0] = r50 + r10 * 4 - r30 * 5; + p5[1] = r51 + r11 * 4 - r31 * 5; + + p0 += max_jj * 6 * 2; + p1 += max_jj * 6 * 2; + p2 += max_jj * 6 * 2; + p3 += max_jj * 6 * 2; + p4 += max_jj * 6 * 2; + p5 += max_jj * 6 * 2; + } + } + } + remain_max_kk_start += nn_max_kk * 2; + for (int kk = remain_max_kk_start; kk < max_kk; kk++) + { + short tmp[6][6]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0123 = bottom_blob.channel(k + kk).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 6; m++) + { + signed char r0 = 0; + signed char r1 = 0; + signed char r2 = 0; + signed char r3 = 0; + signed char r4 = 0; + signed char r5 = 0; + + if (ti * 4 + m < h) + { + // if (elempack == 1) + { + r0 = r0123[0]; + if (tj * 4 + 1 < w) r1 = r0123[1]; + if (tj * 4 + 2 < w) r2 = r0123[2]; + if (tj * 4 + 3 < w) r3 = r0123[3]; + if (tj * 4 + 4 < w) r4 = r0123[4]; + if (tj * 4 + 5 < w) r5 = r0123[5]; + } + } + + short tmp12a = r3 - r1 * 4; + short tmp12b = r4 - r2 * 4; + short tmp34a = (r3 - r1) * 2; + short tmp34b = r4 - r2; + + tmp[0][m] = r4 + r0 * 4 - r2 * 5; + tmp[1][m] = tmp12b + tmp12a; + tmp[2][m] = tmp12b - tmp12a; + tmp[3][m] = tmp34b + tmp34a; + tmp[4][m] = tmp34b - tmp34a; + tmp[5][m] = r5 + r1 * 4 - r3 * 5; + + r0123 += w; + } + + short* p0 = (short*)B + kk * max_jj * 36 + jj; + short* p1 = p0 + max_jj; + short* p2 = p0 + max_jj * 2; + short* p3 = p0 + max_jj * 3; + short* p4 = p0 + max_jj * 4; + short* p5 = p0 + max_jj * 5; + + for (int m = 0; m < 6; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + short r3 = tmp[m][3]; + short r4 = tmp[m][4]; + short r5 = tmp[m][5]; + + short tmp12a = r3 - r1 * 4; + short tmp12b = r4 - r2 * 4; + short tmp34a = (r3 - r1) * 2; + short tmp34b = r4 - r2; + + p0[0] = r4 + r0 * 4 - r2 * 5; + p1[0] = tmp12b + tmp12a; + p2[0] = tmp12b - tmp12a; + p3[0] = tmp34b + tmp34a; + p4[0] = tmp34b - tmp34a; + p5[0] = r5 + r1 * 4 - r3 * 5; + + p0 += max_jj * 6; + p1 += max_jj * 6; + p2 += max_jj * 6; + p3 += max_jj * 6; + p4 += max_jj * 6; + p5 += max_jj * 6; + } + } + } +} + +static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& top_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj) +{ + // const int otm[4][6] = { + // {1, 1, 1, 1, 1, 0}, + // {0, 1, -1, 2, -2, 0}, + // {0, 1, 1, 4, 4, 0}, + // {0, 1, -1, 8, -8, 1} + // }; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int N = top_blob.cstep * out_elempack; + + const int w_tiles = (outw + 3) / 4; + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + int tmp[4][6][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj * 2; + const int* r1 = r0 + max_jj * 2; + const int* r2 = r0 + max_jj * 2 * 2; + const int* r3 = r0 + max_jj * 2 * 3; + const int* r4 = r0 + max_jj * 2 * 4; + const int* r5 = r0 + max_jj * 2 * 5; + + for (int m = 0; m < 5; m++) + { + int tmp02a0 = r1[0] + r2[0]; + int tmp02a1 = r1[1] + r2[1]; + int tmp02b0 = r3[0] + r4[0]; + int tmp02b1 = r3[1] + r4[1]; + int tmp13a0 = r1[0] - r2[0]; + int tmp13a1 = r1[1] - r2[1]; + int tmp13b0 = 
r3[0] - r4[0]; + int tmp13b1 = r3[1] - r4[1]; + + int tmp00 = tmp02a0 + tmp02b0 + r0[0]; + int tmp01 = tmp02a1 + tmp02b1 + r0[1]; + int tmp10 = tmp13a0 + tmp13b0 * 2; + int tmp11 = tmp13a1 + tmp13b1 * 2; + int tmp20 = tmp02a0 + tmp02b0 * 4; + int tmp21 = tmp02a1 + tmp02b1 * 4; + int tmp30 = tmp13a0 + tmp13b0 * 8 + r5[0] * 4; + int tmp31 = tmp13a1 + tmp13b1 * 8 + r5[1] * 4; + + tmp[0][m][0] = tmp00; + tmp[0][m][1] = tmp01; + tmp[1][m][0] = tmp10; + tmp[1][m][1] = tmp11; + tmp[2][m][0] = tmp20; + tmp[2][m][1] = tmp21; + tmp[3][m][0] = tmp30; + tmp[3][m][1] = tmp31; + + r0 += max_jj * 6 * 2; + r1 += max_jj * 6 * 2; + r2 += max_jj * 6 * 2; + r3 += max_jj * 6 * 2; + r4 += max_jj * 6 * 2; + r5 += max_jj * 6 * 2; + } + for (int m = 5; m < 6; m++) + { + int tmp02a0 = r1[0] + r2[0]; + int tmp02a1 = r1[1] + r2[1]; + int tmp02b0 = r3[0] + r4[0]; + int tmp02b1 = r3[1] + r4[1]; + int tmp13a0 = r1[0] - r2[0]; + int tmp13a1 = r1[1] - r2[1]; + int tmp13b0 = r3[0] - r4[0]; + int tmp13b1 = r3[1] - r4[1]; + + int tmp00 = tmp02a0 + tmp02b0 + r0[0]; + int tmp01 = tmp02a1 + tmp02b1 + r0[1]; + int tmp10 = tmp13a0 + tmp13b0 * 2; + int tmp11 = tmp13a1 + tmp13b1 * 2; + int tmp20 = tmp02a0 + tmp02b0 * 4; + int tmp21 = tmp02a1 + tmp02b1 * 4; + int tmp30 = tmp13a0 + tmp13b0 * 8 + r5[0] * 4; + int tmp31 = tmp13a1 + tmp13b1 * 8 + r5[1] * 4; + + tmp00 = tmp00 * 4; + tmp01 = tmp01 * 4; + tmp10 = tmp10 * 4; + tmp11 = tmp11 * 4; + tmp20 = tmp20 * 4; + tmp21 = tmp21 * 4; + tmp30 = tmp30 * 4; + tmp31 = tmp31 * 4; + + tmp[0][m][0] = tmp00; + tmp[0][m][1] = tmp01; + tmp[1][m][0] = tmp10; + tmp[1][m][1] = tmp11; + tmp[2][m][0] = tmp20; + tmp[2][m][1] = tmp21; + tmp[3][m][0] = tmp30; + tmp[3][m][1] = tmp31; + + r0 += max_jj * 6 * 2; + r1 += max_jj * 6 * 2; + r2 += max_jj * 6 * 2; + r3 += max_jj * 6 * 2; + r4 += max_jj * 6 * 2; + r5 += max_jj * 6 * 2; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 4; m++) + { + if (ti * 4 + m >= outh) + continue; + + int tmp02a0 = tmp[m][1][0] + tmp[m][2][0]; + int tmp02a1 = tmp[m][1][1] + tmp[m][2][1]; + int tmp02b0 = tmp[m][3][0] + tmp[m][4][0]; + int tmp02b1 = tmp[m][3][1] + tmp[m][4][1]; + int tmp13a0 = tmp[m][1][0] - tmp[m][2][0]; + int tmp13a1 = tmp[m][1][1] - tmp[m][2][1]; + int tmp13b0 = tmp[m][3][0] - tmp[m][4][0]; + int tmp13b1 = tmp[m][3][1] - tmp[m][4][1]; + + int tmp00 = tmp02a0 + tmp02b0 + tmp[m][0][0]; + int tmp01 = tmp02a1 + tmp02b1 + tmp[m][0][1]; + int tmp10 = tmp13a0 + tmp13b0 * 2; + int tmp11 = tmp13a1 + tmp13b1 * 2; + int tmp20 = tmp02a0 + tmp02b0 * 4; + int tmp21 = tmp02a1 + tmp02b1 * 4; + int tmp30 = tmp13a0 + tmp13b0 * 8 + tmp[m][5][0]; + int tmp31 = tmp13a1 + tmp13b1 * 8 + tmp[m][5][1]; + + tmp00 = tmp00 / 576; + tmp01 = tmp01 / 576; + tmp10 = tmp10 / 576; + tmp11 = tmp11 / 576; + tmp20 = tmp20 / 576; + tmp21 = tmp21 / 576; + tmp30 = tmp30 / 576; + tmp31 = tmp31 / 576; + + // if (out_elempack == 1) + { + int* outptr1 = outptr0 + N; + + outptr0[0] = tmp00; + outptr1[0] = tmp01; + if (tj * 4 + 1 < outw) + { + outptr0[1] = tmp10; + outptr1[1] = tmp11; + } + if (tj * 4 + 2 < outw) + { + outptr0[2] = tmp20; + outptr1[2] = tmp21; + } + if (tj * 4 + 3 < outw) + { + outptr0[3] = tmp30; + outptr1[3] = tmp31; + } + } + + outptr0 += outw; + } + } + } + for (; ii < max_ii; ii++) + { + int tmp[4][6]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj; + const int* r1 = r0 + max_jj; + const int* r2 = r0 + max_jj * 2; 
+ const int* r3 = r0 + max_jj * 3; + const int* r4 = r0 + max_jj * 4; + const int* r5 = r0 + max_jj * 5; + + for (int m = 0; m < 5; m++) + { + int tmp02a = r1[0] + r2[0]; + int tmp02b = r3[0] + r4[0]; + int tmp13a = r1[0] - r2[0]; + int tmp13b = r3[0] - r4[0]; + + int tmp0 = tmp02a + tmp02b + r0[0]; + int tmp1 = tmp13a + tmp13b * 2; + int tmp2 = tmp02a + tmp02b * 4; + int tmp3 = tmp13a + tmp13b * 8 + r5[0] * 4; + + tmp[0][m] = tmp0; + tmp[1][m] = tmp1; + tmp[2][m] = tmp2; + tmp[3][m] = tmp3; + + r0 += max_jj * 6; + r1 += max_jj * 6; + r2 += max_jj * 6; + r3 += max_jj * 6; + r4 += max_jj * 6; + r5 += max_jj * 6; + } + for (int m = 5; m < 6; m++) + { + int tmp02a = r1[0] + r2[0]; + int tmp02b = r3[0] + r4[0]; + int tmp13a = r1[0] - r2[0]; + int tmp13b = r3[0] - r4[0]; + + int tmp0 = tmp02a + tmp02b + r0[0]; + int tmp1 = tmp13a + tmp13b * 2; + int tmp2 = tmp02a + tmp02b * 4; + int tmp3 = tmp13a + tmp13b * 8 + r5[0] * 4; + + tmp0 = tmp0 * 4; + tmp1 = tmp1 * 4; + tmp2 = tmp2 * 4; + tmp3 = tmp3 * 4; + + tmp[0][m] = tmp0; + tmp[1][m] = tmp1; + tmp[2][m] = tmp2; + tmp[3][m] = tmp3; + + r0 += max_jj * 6; + r1 += max_jj * 6; + r2 += max_jj * 6; + r3 += max_jj * 6; + r4 += max_jj * 6; + r5 += max_jj * 6; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 4; m++) + { + if (ti * 4 + m >= outh) + continue; + + int tmp02a = tmp[m][1] + tmp[m][2]; + int tmp02b = tmp[m][3] + tmp[m][4]; + int tmp13a = tmp[m][1] - tmp[m][2]; + int tmp13b = tmp[m][3] - tmp[m][4]; + + int tmp0 = tmp02a + tmp02b + tmp[m][0]; + int tmp1 = tmp13a + tmp13b * 2; + int tmp2 = tmp02a + tmp02b * 4; + int tmp3 = tmp13a + tmp13b * 8 + tmp[m][5]; + + tmp0 = tmp0 / 576; + tmp1 = tmp1 / 576; + tmp2 = tmp2 / 576; + tmp3 = tmp3 / 576; + + // if (out_elempack == 1) + { + outptr0[0] = tmp0; + if (tj * 4 + 1 < outw) outptr0[1] = tmp1; + if (tj * 4 + 2 < outw) outptr0[2] = tmp2; + if (tj * 4 + 3 < outw) outptr0[3] = tmp3; + } + + outptr0 += outw; + } + } + } +} + +static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) +{ + int outw = top_blob.w; + int outh = top_blob.h; + + // pad to 4n+2, winograd F(4,3) + int w_tiles = (outw + 3) / 4; + int h_tiles = (outh + 3) / 4; + int tiles = w_tiles * h_tiles; + + const int M = top_blob.c * top_blob.elempack; + const int N = tiles; + const int K = bottom_blob.c * bottom_blob.elempack; + const int B = 36; + + // NCNN_LOGE("conv3x3s1_winograd43_int8 %d %d %d", M, N, K); + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + const int nn_N = (N + TILE_N - 1) / TILE_N; + const int nn_K = (K + TILE_K - 1) / TILE_K; + + // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); + + Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + + const int nn_NK = nn_N * nn_K; + + if (nT > 1 && nn_NK < nT) + { + Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); + + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + // transform input + conv3x3s1_winograd43_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, nT); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + 
transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, nT); + } + } + else + { + Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat B_tile = B_tileX.channel(get_omp_thread_num()); + + // transform input + conv3x3s1_winograd43_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, 1); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, 1); + } + } + + Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat top_tile = top_tileX.channel(get_omp_thread_num()); + + const int max_ii = std::min((M - i), TILE_M); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + bool k_end = k + TILE_K >= K; + + gemm_transB_packed_tile_int8(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, k_end); + } + + // transform output + conv3x3s1_winograd43_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj); + } + } +} diff --git a/src/layer/riscv/convolution_im2col_gemm_int8.h b/src/layer/riscv/convolution_im2col_gemm_int8.h new file mode 100644 index 000000000000..4d615e58b4d5 --- /dev/null +++ b/src/layer/riscv/convolution_im2col_gemm_int8.h @@ -0,0 +1,953 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
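+
+// Roughly, the scheme in this file: the convolution is lowered to a single
+// int8 GEMM, C = A * B^T, where
+//   A is the kernel repacked to outch x (inch * maxk)         (M x K)
+//   B is the im2col-ed input, (outw * outh) x (inch * maxk)   (N x K)
+//   C holds the int32 accumulators, outch x (outw * outh)     (M x N)
+// For example, a 3x3 stride-1 convolution with inch = 16, outch = 32 and a
+// 56x56 output gives M = 32, N = 56 * 56 = 3136, K = 16 * 9 = 144 (these
+// sizes are only illustrative). The GEMM is blocked into TILE_M / TILE_N /
+// TILE_K pieces sized from the L2 cache, see
+// convolution_im2col_gemm_get_optimal_tile_mnk_int8 below.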
+ +static void convolution_im2col_pack_A_tile_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) +{ + // A = (pa, maxk, inch/pa), outch + const int A_hstep = A.w; + + signed char* pp = AT; + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + const signed char* p0 = (const signed char*)A + (i + ii) * A_hstep + k; + const signed char* p1 = (const signed char*)A + (i + ii + 1) * A_hstep + k; + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[1]; + pp[2] = p1[0]; + pp[3] = p1[1]; + pp += 4; + p0 += 2; + p1 += 2; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p1[0]; + pp += 2; + p0++; + p1++; + } + } + for (; ii < max_ii; ii += 1) + { + const signed char* p0 = (const signed char*)A + (i + ii) * A_hstep + k; + + int kk = 0; + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp += 1; + p0++; + } + } +} + +static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, Mat& topT_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end) +{ + // NCNN_LOGE("convolution_gemm_transB_packed_tile_int8 %d %d %d %d %d %d", i, max_ii, j, max_jj, k, max_kk); + + const int out_elempack = top_blob.elempack; + const int out_hstep = (int)top_blob.cstep; + + const signed char* pAT = AT_tile; + const signed char* pBT = BT_tile; + + int* outptr = topT_tile; + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + int* outptr0 = (int*)top_blob + (i + ii) * out_hstep + j; + + const signed char* pB = pBT; + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + int sum00; + int sum10; + int sum01; + int sum11; + + if (k == 0) + { + sum00 = 0; + sum10 = 0; + sum01 = 0; + sum11 = 0; + } + else + { + sum00 = outptr[0]; + sum10 = outptr[1]; + sum01 = outptr[2]; + sum11 = outptr[3]; + } + + const signed char* pA = pAT; + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum00 += pA[0] * pB[0]; + sum00 += pA[1] * pB[1]; + sum10 += pA[2] * pB[0]; + sum10 += pA[3] * pB[1]; + sum01 += pA[0] * pB[2]; + sum01 += pA[1] * pB[3]; + sum11 += pA[2] * pB[2]; + sum11 += pA[3] * pB[3]; + pA += 4; + pB += 4; + } + for (; kk < max_kk; kk += 1) + { + sum00 += pA[0] * pB[0]; + sum10 += pA[1] * pB[0]; + sum01 += pA[0] * pB[1]; + sum11 += pA[1] * pB[1]; + pA += 2; + pB += 2; + } + + if (k_end) + { + // if (out_elempack == 1) + { + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr0[out_hstep] = sum10; + outptr0[out_hstep + 1] = sum11; + outptr0 += 2; + } + } + else + { + outptr[0] = sum00; + outptr[1] = sum10; + outptr[2] = sum01; + outptr[3] = sum11; + } + + outptr += 4; + } + for (; jj < max_jj; jj += 1) + { + int sum0; + int sum1; + + if (k == 0) + { + sum0 = 0; + sum1 = 0; + } + else + { + sum0 = outptr[0]; + sum1 = outptr[1]; + } + + const signed char* pA = pAT; + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum0 += pA[0] * pB[0]; + sum0 += pA[1] * pB[1]; + sum1 += pA[2] * pB[0]; + sum1 += pA[3] * pB[1]; + pA += 4; + pB += 2; + } + for (; kk < max_kk; kk += 1) + { + sum0 += pA[0] * pB[0]; + sum1 += pA[1] * pB[0]; + pA += 2; + pB += 1; + } + + if (k_end) + { + // if (out_elempack == 1) + { + outptr0[0] = sum0; + outptr0[out_hstep] = sum1; + outptr0++; + } + } + else + { + outptr[0] = sum0; + outptr[1] = sum1; + } + + outptr += 2; + } + + pAT += max_kk * 2; + } + for (; ii < max_ii; ii += 1) + { + int* outptr0 = (int*)top_blob + (i + ii) * out_hstep + j; + + const signed char* pB = pBT; + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + int sum0; + int sum1; + + if (k == 0) + { + sum0 = 0; + 
sum1 = 0;
+            }
+            else
+            {
+                sum0 = outptr[0];
+                sum1 = outptr[1];
+            }
+
+            const signed char* pA = pAT;
+            int kk = 0;
+            for (; kk + 1 < max_kk; kk += 2)
+            {
+                sum0 += pA[0] * pB[0];
+                sum0 += pA[1] * pB[1];
+                sum1 += pA[0] * pB[2];
+                sum1 += pA[1] * pB[3];
+                pA += 2;
+                pB += 4;
+            }
+            for (; kk < max_kk; kk += 1)
+            {
+                sum0 += pA[0] * pB[0];
+                sum1 += pA[0] * pB[1];
+                pA += 1;
+                pB += 2;
+            }
+
+            if (k_end)
+            {
+                // if (out_elempack == 1)
+                {
+                    outptr0[0] = sum0;
+                    outptr0[1] = sum1;
+                    outptr0 += 2;
+                }
+            }
+            else
+            {
+                outptr[0] = sum0;
+                outptr[1] = sum1;
+            }
+
+            outptr += 2;
+        }
+        for (; jj < max_jj; jj += 1)
+        {
+            int sum;
+
+            if (k == 0)
+            {
+                sum = 0;
+            }
+            else
+            {
+                sum = outptr[0];
+            }
+
+            const signed char* pA = pAT;
+            int kk = 0;
+            for (; kk < max_kk; kk += 1)
+            {
+                sum += pA[0] * pB[0];
+                pA += 1;
+                pB += 1;
+            }
+
+            if (k_end)
+            {
+                // if (out_elempack == 1)
+                {
+                    outptr0[0] = sum;
+                    outptr0++;
+                }
+            }
+            else
+            {
+                outptr[0] = sum;
+            }
+
+            outptr += 1;
+        }
+
+        pAT += max_kk;
+    }
+}
+
+static void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
+{
+    // resolve optimal tile size from cache size
+    const size_t l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(signed char));
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
+    // solve K
+    {
+        // try not to split K
+        int tile_size = (l2_cache_size_int8 - 2) / 3;
+        TILE_K = std::max(2, tile_size / 2 * 2);
+
+        int nn_K = (K + TILE_K - 1) / TILE_K;
+        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
+    }
+
+    // solve M
+    {
+        int nn_M = (M + 7) / 8;
+        TILE_M = std::max(2, ((M + nn_M - 1) / nn_M + 1) / 2 * 2);
+    }
+
+    {
+        TILE_M *= std::min(nT, get_physical_cpu_count());
+
+        int nn_M = (M + TILE_M - 1) / TILE_M;
+        TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 1) / 2 * 2);
+
+        if (nT > 1)
+        {
+            TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 1) / 2 * 2);
+        }
+    }
+
+    if (N > 0)
+    {
+        int tile_size;
+        if (TILE_K >= K)
+        {
+            tile_size = (l2_cache_size_int8 - TILE_M * TILE_K) / TILE_K;
+        }
+        else
+        {
+            tile_size = (l2_cache_size_int8 - TILE_M * TILE_K) / (TILE_M * 4 + TILE_K);
+        }
+        TILE_N = std::max(1, tile_size);
+
+        int nn_N = (N + TILE_N - 1) / TILE_N;
+        TILE_N = std::min(TILE_N, (N + nn_N - 1) / nn_N);
+    }
+}
+
+static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
+{
+    const int elempack = bottom_blob.elempack;
+
+    signed char* pp = B;
+
+    int jj = 0;
+    for (; jj + 1 < max_jj; jj += 2)
+    {
+        if (elempack == 1)
+        {
+            const signed char* p0 = (const signed char*)bottom_blob.channel(k) + (j + jj);
+
+            int kk = 0;
+            for (; kk + 1 < max_kk; kk += 2)
+            {
+                pp[0] = p0[0];
+                pp[1] = p0[bottom_blob.cstep];
+                pp[2] = p0[1];
+                pp[3] = p0[bottom_blob.cstep + 1];
+                pp += 4;
+                p0 += bottom_blob.cstep * 2;
+            }
+            for (; kk < max_kk; kk++)
+            {
+                pp[0] = p0[0];
+                pp[1] = p0[1];
+                pp += 2;
+                p0 += bottom_blob.cstep;
+            }
+        }
+    }
+    for (; jj < max_jj; jj++)
+    {
+        if (elempack == 1)
+        {
+            const signed char* p0 = (const signed char*)bottom_blob.channel(k) + (j + jj);
+
+            int kk = 0;
+            for (; kk < max_kk; kk++)
+            {
+                pp[0] = p0[0];
+                pp += 1;
+                p0 += bottom_blob.cstep;
+            }
+        }
+    }
+}
+
+template<int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h>
+void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
+{
+    const int w = bottom_blob.w;
+    // const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 
1) + 1; + const int outw = (w - kernel_extent_w) / stride_w + 1; + + // j max_jj outw*outh split w and h + + // k max_kk pa*maxk*(inch/pa) split inch + + // k/max_kk shall be multiple of maxk + + const int maxk = kernel_w * kernel_h; + + signed char* pp = B; + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dx0 = (j + jj) % outw; + int dx1 = (j + jj + 1) % outw; + + if (dy0 == dy1) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp += 2; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp += 2; + } + } + } + } + for (; jj < max_jj; jj++) + { + int dy = (j + jj) / outw; + int dx = (j + jj) % outw; + + int kk = 0; + for (; kk < max_kk / elempack; 
kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x = stride_w * dx + dilation_w * v; + int y = stride_h * dy + dilation_h * u; + + const signed char* sptr = img.row(y) + x * elempack; + + if (elempack == 1) + { + pp[0] = sptr[0]; + pp += 1; + } + } + } +} + +template void convolution_im2col_input_tile_int8<1, 1, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<3, 3, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<3, 3, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<5, 5, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<5, 5, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<7, 7, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); + +static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h) +{ + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_input_tile_conv1x1s1d1_int8(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 1 && kernel_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_input_tile_int8<1, 1, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_input_tile_int8<3, 3, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_input_tile_int8<3, 3, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_input_tile_int8<5, 5, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_input_tile_int8<5, 5, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_input_tile_int8<7, 7, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + const int w = bottom_blob.w; + // const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int outw = (w - kernel_extent_w) / stride_w + 1; + + // j max_jj outw*outh split w and h + + // k max_kk pa*maxk*(inch/pa) split inch + + // k/max_kk shall be multiple of maxk + + const int maxk = kernel_w * kernel_h; + + signed char* pp = B; + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dx0 = (j + jj) % outw; + int 
dx1 = (j + jj + 1) % outw; + + if (dy0 == dy1) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp += 2; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp += 2; + } + } + } + } + for (; jj < max_jj; jj++) + { + int dy = (j + jj) / outw; + int dx = (j + jj) % outw; + + int kk = 0; + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x = stride_w * dx + dilation_w * v; + int y = stride_h * dy + dilation_h * u; + + const signed char* sptr = img.row(y) + x * elempack; + + if (elempack == 1) + { + pp[0] = sptr[0]; + pp += 1; + } + } + } +} + +static void 
convolution_im2col_gemm_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt) +{ + // NCNN_LOGE("convolution_im2col_gemm_transform_kernel"); + const int maxk = kernel_w * kernel_h; + + const int M = outch; + const int K = inch * maxk; + + int TILE_M, TILE_N, TILE_K; + convolution_im2col_gemm_get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + + int elempack = 1; + + // maxk-inch-outch to pa-maxk-inch/pa-outch + Mat A_data; + if (maxk == 1) + { + A_data = kernel.reshape(maxk * inch, outch); + } + else + { + Mat weight_data_r2 = kernel.reshape(maxk, inch, outch); + + A_data.create(maxk * inch, outch, (size_t)1u, 1); + + for (int q = 0; q < outch; q += 1) + { + signed char* g00 = A_data.row(q); + + for (int p = 0; p + (elempack - 1) < inch; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < elempack; i++) + { + const signed char* k00 = weight_data_r2.channel(q).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + + AT.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, (size_t)1u, 1); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + const int max_ii = std::min((M - i), TILE_M); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1); + + convolution_im2col_pack_A_tile_int8(A_data, AT_tile, i, max_ii, k, max_kk); + } + } +} + +static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + + const int M = top_blob.c * top_blob.elempack; + const int N = top_blob.w * top_blob.h; + const int K = bottom_blob.c * bottom_blob.elempack * maxk; + + int TILE_M, TILE_N, TILE_K; + convolution_im2col_gemm_get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + const int nn_N = (N + TILE_N - 1) / TILE_N; + const int nn_K = (K + TILE_K - 1) / TILE_K; + + // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); + + Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 1u, opt.workspace_allocator); + + const int nn_NK = nn_N * nn_K; + + #pragma omp parallel for num_threads(nT) + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); + + // im2col + convolution_im2col_input_tile_int8(bottom_blob, BT_tile, j, max_jj, k, max_kk, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h); + } + + Mat topT_tileX; + if (K > TILE_K) + topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat topT_tile; + if (K > TILE_K) + topT_tile = topT_tileX.channel(get_omp_thread_num()); + + const int max_ii = std::min((M - i), TILE_M); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + for 
(int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + const Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1); + + const Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); + + bool k_end = k + TILE_K >= K; + + convolution_gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, top_blob, i, max_ii, j, max_jj, k, max_kk, k_end); + } + } + } +} diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h new file mode 100644 index 000000000000..a275f05c6c7e --- /dev/null +++ b/src/layer/riscv/convolution_packed_int8.h @@ -0,0 +1,398 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + + // clang-format off + // *INDENT-OFF* + if (outch >= 2) + { + if (inch >= 2) + kernel_tm.create(maxk, inch / 2 + inch % 2, outch / 2 + outch % 2, (size_t)4u, 4); + else + kernel_tm.create(maxk, inch, outch / 2 + outch % 2, (size_t)2u, 2); + } + else + { + if (inch >= 2) + kernel_tm.create(maxk, inch / 2 + inch % 2, outch, (size_t)2u, 2); + else + kernel_tm.create(maxk, inch, outch, (size_t)1u, 1); + } + // *INDENT-ON* + // clang-format on + + int q = 0; + for (; q + 1 < outch; q += 2) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + signed char* g00 = kernel_tm.channel(q / 2); + + int p = 0; + for (; p + 1 < inch; p += 2) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k0[maxk]; + g00[3] = k1[maxk]; + g00 += 4; + } + + kptr0 += maxk * 2; + kptr1 += maxk * 2; + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00 += 2; + } + } + } + for (; q < outch; q++) + { + const signed char* kptr = (const signed char*)kernel + q * inch * maxk; + signed char* g00 = kernel_tm.channel(q / 2 + q % 2); + + int p = 0; + for (; p + 1 < inch; p += 2) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + + g00[0] = k0[0]; + g00[1] = k0[maxk]; + g00 += 2; + } + + kptr += maxk * 2; + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + + g00[0] = k0[0]; + g00++; + } + } + } +} + +static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + const 
int w = bottom_blob.w; + const int elempack = bottom_blob.elempack; + const int inch = bottom_blob.c * elempack; + + const int N = bottom_blob.cstep * elempack; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int outch = top_blob.c * out_elempack; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2 * elempack; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = (outch - remain_outch_start) / 2; + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 2; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int i1 = (ij + 1) / outw; + const int j0 = ij % outw; + const int j1 = (ij + 1) % outw; + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + const signed char* kptr = weight_data_tm.channel(p / 2); + + int q = 0; + for (; q + 1 < inch; q += 2) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + sum00 += r0s[0] * kptr[0]; + sum10 += r0s[0] * kptr[1]; + sum00 += r0s[N] * kptr[2]; + sum10 += r0s[N] * kptr[3]; + sum01 += r1s[0] * kptr[0]; + sum11 += r1s[0] * kptr[1]; + sum01 += r1s[N] * kptr[2]; + sum11 += r1s[N] * kptr[3]; + + kptr += 4; + } + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + sum00 += r0s[0] * kptr[0]; + sum10 += r0s[0] * kptr[1]; + sum01 += r1s[0] * kptr[0]; + sum11 += r1s[0] * kptr[1]; + + kptr += 2; + } + } + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr0 += 2; + outptr1 += 2; + } + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum0 = 0; + int sum1 = 0; + + const signed char* kptr = weight_data_tm.channel(p / 2); + + int q = 0; + for (; q + 1 < inch; q += 2) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum1 += r0s[0] * kptr[1]; + sum0 += r0s[N] * kptr[2]; + sum1 += r0s[N] * kptr[3]; + + kptr += 4; + } + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if 
(elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum1 += r0s[0] * kptr[1]; + + kptr += 2; + } + } + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr0 += 1; + outptr1 += 1; + } + } + remain_outch_start += nn_outch * 2; + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int i1 = (ij + 1) / outw; + const int j0 = ij % outw; + const int j1 = (ij + 1) % outw; + + int sum0 = 0; + int sum1 = 0; + + const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); + + int q = 0; + for (; q + 1 < inch; q += 2) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum0 += r0s[N] * kptr[1]; + sum1 += r1s[0] * kptr[0]; + sum1 += r1s[N] * kptr[1]; + + kptr += 2; + } + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum1 += r1s[0] * kptr[0]; + + kptr += 1; + } + } + } + + outptr[0] = sum0; + outptr[1] = sum1; + outptr += 2; + } + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum = 0; + + const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); + + int q = 0; + for (; q + 1 < inch; q += 2) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum += r0s[0] * kptr[0]; + sum += r0s[N] * kptr[1]; + + kptr += 2; + } + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum += r0s[0] * kptr[0]; + + kptr += 1; + } + } + } + + outptr[0] = sum; + outptr += 1; + } + } +} diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index be413e5be252..b6e470ceae8a 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -35,6 +35,13 @@ namespace ncnn { #include "convolution_1x1.h" #include "convolution_3x3.h" +#if NCNN_INT8 +#include "convolution_packed_int8.h" +#include "convolution_im2col_gemm_int8.h" + +#include "convolution_3x3_winograd_int8.h" +#endif // NCNN_INT8 + #if __riscv_vector #include "convolution_packn.h" #include "convolution_pack1ton.h" @@ -133,8 +140,7 @@ int Convolution_riscv::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { - // TODO implement int8 - return 0; + return create_pipeline_int8_riscv(opt); } #endif @@ -279,7 +285,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti Option opt_unpacked = opt; opt_unpacked.use_packing_layout = false; - return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return 
forward_int8_riscv(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); } #endif @@ -1102,4 +1108,137 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int Convolution_riscv::create_pipeline_int8_riscv(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8); + + if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if (opt.use_winograd43_convolution) + conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt); + else + conv3x3s1_winograd23_transform_kernel_int8(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); + } + else + { + convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + // NCNN_LOGE("Convolution_riscv input %dx%d ksize=%dx%d stride=%dx%d", + // bottom_blob.w, bottom_blob.h, kernel_w, kernel_h, stride_w, stride_h); + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = bottom_blob_bordered.h; + int channels = bottom_blob_bordered.c; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; + size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + + // NCNN_LOGE("forward_int8_riscv %dx%dx%d elempack=%d out_elempack=%d int8_scale_term=%d", + // w, h, bottom_blob_bordered.c, elempack, out_elempack, int8_scale_term); + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + + int out_elempack_int32 = 1; + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8); + + int _nT = opt.num_threads; + + if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if (opt.use_winograd43_convolution && !weight_winograd43_data.empty()) + conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); + else + conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + } + else + { + convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + + // if (use_int8_requantize) + // { + // requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + // } + // else + // { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + // } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h index a4e008c9dd1d..3aa5bcc72587 100644 --- a/src/layer/riscv/convolution_riscv.h +++ b/src/layer/riscv/convolution_riscv.h @@ -37,17 +37,25 @@ class Convolution_riscv : public Convolution int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8_riscv(const Option& opt); + int forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif public: Layer* activation; Mat weight_data_tm; + Mat weight_sgemm_data; Mat weight_winograd23_data; Mat weight_winograd43_data; Mat weight_winograd63_data; // fp16 Mat bias_data_fp16; +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn From 7cb209189a3a7cb41ed72a6fbd52e3045723e5ad Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sat, 9 Dec 2023 12:26:33 +0800 Subject: [PATCH 02/10] riscv int8 innerproduct Signed-off-by: Molly Sophia --- src/layer/riscv/innerproduct_riscv.cpp | 178 ++++++++++++++++++++++++- src/layer/riscv/innerproduct_riscv.h | 9 ++ 2 files changed, 184 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index accfc683584f..c0d22817710a 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -52,8 
+52,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { - // TODO implement int8 - return 0; + return create_pipeline_int8_riscv(opt); } #endif @@ -148,7 +147,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt Option opt_unpacked = opt; opt_unpacked.use_packing_layout = false; - return InnerProduct::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8_riscv(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); } #endif @@ -1090,4 +1089,177 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int InnerProduct_riscv::create_pipeline_int8_riscv(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g0 = weight_data_tm.row(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input) + { + // gemm + Mat bottom_blob_int8_unpacked; + Option opt_unpack = opt; + opt_unpack.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + + int h = bottom_blob_int8_unpacked.h; + + int out_elempack = 1; + + int outh = h / out_elempack; + + top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; + + if (num_output_elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + int sum = 0; + + int i = 0; + for (; i < num_input; i++) + { + sum += *m++ * *kptr++; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + outptr[0] = activation_ss(sumfp32, activation_type, activation_params); + outptr += 1; + } + } + } + + return 0; + } + + Mat bottom_blob_int8_flattened = bottom_blob_int8; + if (bottom_blob_int8.dims != 1) + { + Option opt_flatten = opt; 
+ opt_flatten.blob_allocator = opt.workspace_allocator; + flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + } + + // int elempack = bottom_blob_int8_flattened.elempack; + + int out_elempack = 1; + // size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + int sum = 0; + + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int i = 0; + for (; i < num_input; i++) + { + signed char val = sptr[0]; + + signed char w = kptr[0]; + + sum += val * w; + + sptr += 1; + kptr += 1; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + top_blob[p] = sumfp32; + } + } + + return 0; +} +#endif // NCNN_INT8 } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index d3056d5801d0..9b44bf8a3cca 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -36,6 +36,11 @@ class InnerProduct_riscv : public InnerProduct int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8_riscv(const Option& opt); + int forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* flatten; @@ -43,6 +48,10 @@ class InnerProduct_riscv : public InnerProduct // fp16 Mat bias_data_fp16; + +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn From bd2fd74e5ffbd7039dd0e98a1bc2b3e9f91d6dc9 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 22 Dec 2023 08:25:35 +0800 Subject: [PATCH 03/10] rvv conv int8 Signed-off-by: Molly Sophia --- src/layer/riscv/convolution_1x1_int8.h | 26 + src/layer/riscv/convolution_riscv.cpp | 48 +- src/layer/riscv/convolution_sgemm_int8.h | 628 +++++++++++++++ .../riscv/convolution_sgemm_packnto1_int8.h | 720 ++++++++++++++++++ 4 files changed, 1400 insertions(+), 22 deletions(-) create mode 100644 src/layer/riscv/convolution_1x1_int8.h create mode 100644 src/layer/riscv/convolution_sgemm_int8.h create mode 100644 src/layer/riscv/convolution_sgemm_packnto1_int8.h diff --git a/src/layer/riscv/convolution_1x1_int8.h b/src/layer/riscv/convolution_1x1_int8.h new file mode 100644 index 000000000000..6d0f546d25c1 --- /dev/null +++ b/src/layer/riscv/convolution_1x1_int8.h @@ -0,0 +1,26 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
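+
+// With a 1x1 kernel and stride 1 every output pixel reads exactly one input sample per
+// channel, so the whole layer collapses into a single GEMM:
+//
+//   top[oc][i] = sum over ic of weight[oc][ic] * bottom[ic][i],   i in [0, w*h)
+//
+// The helper below therefore only relabels the blob (the w*h pixels become the gemm N
+// dimension) and forwards to im2col_sgemm_int8_rvv; no data is copied. Rough scalar
+// pseudo-code of the same sum, assuming elempack == 1 and an int32 top blob
+// (illustrative sketch only, not the optimized path):
+//
+//   for (int oc = 0; oc < outch; oc++)
+//       for (int i = 0; i < w * h; i++)
+//       {
+//           int sum = 0;
+//           for (int ic = 0; ic < inch; ic++)
+//               sum += bottom[ic][i] * weight[oc * inch + ic];
+//           top[oc][i] = sum;
+//       }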
+ +static void conv1x1s1_sgemm_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_int8_rvv(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index b6e470ceae8a..99f71f0f0bab 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -37,9 +37,10 @@ namespace ncnn { #if NCNN_INT8 #include "convolution_packed_int8.h" -#include "convolution_im2col_gemm_int8.h" #include "convolution_3x3_winograd_int8.h" +#include "convolution_sgemm_int8.h" +#include "convolution_1x1_int8.h" #endif // NCNN_INT8 #if __riscv_vector @@ -1116,7 +1117,11 @@ int Convolution_riscv::create_pipeline_int8_riscv(const Option& opt) bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8); - if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_int8_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { if (opt.use_winograd43_convolution) conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt); @@ -1125,7 +1130,8 @@ int Convolution_riscv::create_pipeline_int8_riscv(const Option& opt) } else if (opt.use_sgemm_convolution) { - convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); + convolution_im2col_sgemm_transform_kernel_int8_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + // convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); } else { @@ -1165,9 +1171,6 @@ int Convolution_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); } - // NCNN_LOGE("Convolution_riscv input %dx%d ksize=%dx%d stride=%dx%d", - // bottom_blob.w, bottom_blob.h, kernel_w, kernel_h, stride_w, stride_h); - Mat bottom_blob_bordered; make_padding(bottom_blob_int8, bottom_blob_bordered, opt); if (bottom_blob_bordered.empty()) @@ -1188,9 +1191,6 @@ int Convolution_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, int out_elempack = 1; size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; - // NCNN_LOGE("forward_int8_riscv %dx%dx%d elempack=%d out_elempack=%d int8_scale_term=%d", - // w, h, bottom_blob_bordered.c, elempack, out_elempack, int8_scale_term); - top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -1207,7 +1207,11 @@ int Convolution_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, int _nT = opt.num_threads; - if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + } + else if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { if (opt.use_winograd43_convolution && !weight_winograd43_data.empty()) conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); @@ -1216,26 +1220,26 @@ int Convolution_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, } else if (opt.use_sgemm_convolution) { - convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + convolution_im2col_sgemm_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); } else { convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); } - // if (use_int8_requantize) - // { - // requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); - // } - // else - // { - dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); - - if (activation) + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else { - activation->forward_inplace(top_blob, opt); + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } } - // } return 0; } diff --git a/src/layer/riscv/convolution_sgemm_int8.h b/src/layer/riscv/convolution_sgemm_int8.h new file mode 100644 index 000000000000..276be4e92ca7 --- /dev/null +++ b/src/layer/riscv/convolution_sgemm_int8.h @@ -0,0 +1,628 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
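+
+// The kernels in this file produce raw int32 accumulators, sum = Σ (int8 activation × int8 weight),
+// one per output channel and output pixel. Turning those back into real values happens in the
+// caller (convolution_riscv.cpp), per output channel p, roughly as
+//
+//   scale_in[p] = 1.f / (bottom_blob_int8_scale * weight_data_int8_scale[p]);
+//   float out   = sum * scale_in[p] + bias[p];        // dequantize_from_int32
+//   out         = activation(out);
+//   // and only when int8_scale_term > 100, a further requantize back to int8:
+//   // out_q = saturate<int8>(round(out * top_blob_int8_scale));
+//
+// (sketch of the epilogue under those assumptions; the exact rounding and saturation live in the
+// generic requantize/dequantize layers, not in this header)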
+ +static void im2col_sgemm_int8_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ +#if __riscv_vector + int packn = csrr_vlenb(); + size_t vl = vsetvl_e8m1(packn); +#else + int packn = 4; + size_t vl = 4; +#endif + + // Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= packn) + tmp.create(packn * maxk, inch, size / packn + size % packn, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + { + int nn_size = size / packn; + int remain_size_start = nn_size * packn; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = ii * packn; + + int8_t* tmpptr = tmp.channel(i / packn); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { +#if __riscv_vector + vse8_v_i8m1(tmpptr, vle8_v_i8m1(img0, vl), vl); +#else + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + tmpptr[2] = img0[2]; + tmpptr[3] = img0[3]; +#endif + img0 += size; + tmpptr += packn; + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int8_t* tmpptr = tmp.channel(i / packn + i % packn); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + img0 += size; + tmpptr += 1; + } + } + } + } + + +#if __riscv_vector + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + int* outptr4 = top_blob.channel(p + 4); + int* outptr5 = top_blob.channel(p + 5); + int* outptr6 = top_blob.channel(p + 6); + int* outptr7 = top_blob.channel(p + 7); + + int i = 0; + for (; i + (packn - 1) < size; i += packn) + { + const int8_t* tmpptr = tmp.channel(i / packn); + const int8_t* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + vint32m4_t _sum0_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum1_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum2_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum3_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum4_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum5_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum6_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum7_32 = vmv_v_x_i32m4(0, vl); + + for (int q = 0; q < nn; q++) + { + vint16m2_t _val = vwcvt_x_x_v_i16m2(vle8_v_i8m1(tmpptr, vl), vl); + _sum0_32 = vwmacc_vx_i32m4(_sum0_32, kptr[0], _val, vl); + _sum1_32 = vwmacc_vx_i32m4(_sum1_32, kptr[1], _val, vl); + _sum2_32 = vwmacc_vx_i32m4(_sum2_32, kptr[2], _val, vl); + _sum3_32 = vwmacc_vx_i32m4(_sum3_32, kptr[3], _val, vl); + _sum4_32 = vwmacc_vx_i32m4(_sum4_32, kptr[4], _val, vl); + _sum5_32 = vwmacc_vx_i32m4(_sum5_32, kptr[5], _val, vl); + _sum6_32 = vwmacc_vx_i32m4(_sum6_32, kptr[6], _val, vl); + _sum7_32 = vwmacc_vx_i32m4(_sum7_32, kptr[7], _val, vl); + tmpptr += packn; + kptr += 8; + } + + vse32_v_i32m4(outptr0, _sum0_32, vl); + vse32_v_i32m4(outptr1, _sum1_32, vl); + vse32_v_i32m4(outptr2, _sum2_32, vl); + 
vse32_v_i32m4(outptr3, _sum3_32, vl); + vse32_v_i32m4(outptr4, _sum4_32, vl); + vse32_v_i32m4(outptr5, _sum5_32, vl); + vse32_v_i32m4(outptr6, _sum6_32, vl); + vse32_v_i32m4(outptr7, _sum7_32, vl); + + outptr0 += packn; + outptr1 += packn; + outptr2 += packn; + outptr3 += packn; + outptr4 += packn; + outptr5 += packn; + outptr6 += packn; + outptr7 += packn; + } + for (; i < size; i++) + { + const int8_t* tmpptr = tmp.channel(i / packn + i % packn); + const int8_t* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + int sum4 = 0; + int sum5 = 0; + int sum6 = 0; + int sum7 = 0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + sum4 += tmpptr[0] * kptr[4]; + sum5 += tmpptr[0] * kptr[5]; + sum6 += tmpptr[0] * kptr[6]; + sum7 += tmpptr[0] * kptr[7]; + tmpptr++; + kptr += 8; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + outptr4[0] = sum4; + outptr5[0] = sum5; + outptr6[0] = sum6; + outptr7[0] = sum7; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + outptr4++; + outptr5++; + outptr6++; + outptr7++; + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + (packn - 1) < size; i += packn) + { + const int8_t* tmpptr = tmp.channel(i / packn); + const int8_t* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + vint32m4_t _sum0_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum1_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum2_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum3_32 = vmv_v_x_i32m4(0, vl); + + for (int q = 0; q < nn; q++) + { + vint16m2_t _val = vwcvt_x_x_v_i16m2(vle8_v_i8m1(tmpptr, vl), vl); + _sum0_32 = vwmacc_vx_i32m4(_sum0_32, kptr[0], _val, vl); + _sum1_32 = vwmacc_vx_i32m4(_sum1_32, kptr[1], _val, vl); + _sum2_32 = vwmacc_vx_i32m4(_sum2_32, kptr[2], _val, vl); + _sum3_32 = vwmacc_vx_i32m4(_sum3_32, kptr[3], _val, vl); + + tmpptr += packn; + kptr += 4; + } + + vse32_v_i32m4(outptr0, _sum0_32, vl); + vse32_v_i32m4(outptr1, _sum1_32, vl); + vse32_v_i32m4(outptr2, _sum2_32, vl); + vse32_v_i32m4(outptr3, _sum3_32, vl); + + outptr0 += packn; + outptr1 += packn; + outptr2 += packn; + outptr3 += packn; + } + for (; i < size; i++) + { + const int8_t* tmpptr = tmp.channel(i / packn + i % packn); + const int8_t* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + tmpptr++; + kptr += 4; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + } + + remain_outch_start += nn_outch << 2; +#else + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = 
top_blob.channel(p + 1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + const int8_t* tmpptr = tmp.channel(i / 4); + const int8_t* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + int sum00 = 0; + int sum01 = 0; + int sum02 = 0; + int sum03 = 0; + int sum10 = 0; + int sum11 = 0; + int sum12 = 0; + int sum13 = 0; + + for (int q = 0; q < nn; q++) + { + int8_t k0 = kptr[0]; + int8_t k1 = kptr[1]; + sum00 += tmpptr[0] * k0; + sum01 += tmpptr[1] * k0; + sum02 += tmpptr[2] * k0; + sum03 += tmpptr[3] * k0; + sum10 += tmpptr[0] * k1; + sum11 += tmpptr[1] * k1; + sum12 += tmpptr[2] * k1; + sum13 += tmpptr[3] * k1; + tmpptr += 4; + kptr += 2; + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr0[2] = sum02; + outptr0[3] = sum03; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr1[2] = sum12; + outptr1[3] = sum13; + + outptr0 += 4; + outptr1 += 4; + } + for (; i < size; i++) + { + const int8_t* tmpptr = tmp.channel(i / 4 + i % 4); + const int8_t* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + int sum0 = 0; + int sum1 = 0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + tmpptr++; + kptr += 2; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + + outptr0++; + outptr1++; + } + } +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + (packn - 1) < size; i += packn) + { + const int8_t* tmpptr = tmp.channel(i / packn); +#if __riscv_vector + const int8_t* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const int8_t* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + +#if __riscv_vector + vint32m4_t _sum0_32 = vmv_v_x_i32m4(0, vl); +#else + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; +#endif + + for (int q = 0; q < nn; q++) + { +#if __riscv_vector + vint16m2_t _val = vwcvt_x_x_v_i16m2(vle8_v_i8m1(tmpptr, vl), vl); + _sum0_32 = vwmacc_vx_i32m4(_sum0_32, kptr[0], _val, vl); +#else + int8_t k0 = kptr[0]; + sum0 += tmpptr[0] * k0; + sum1 += tmpptr[1] * k0; + sum2 += tmpptr[2] * k0; + sum3 += tmpptr[3] * k0; +#endif + tmpptr += packn; + kptr++; + } + +#if __riscv_vector + vse32_v_i32m4(outptr0, _sum0_32, vl); +#else + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0[2] = sum2; + outptr0[3] = sum3; +#endif + outptr0 += packn; + } + for (; i < size; i++) + { + const int8_t* tmpptr = tmp.channel(i / packn + i % packn); +#if __riscv_vector + const int8_t* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const int8_t* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + + int sum0 = 0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + tmpptr++; + kptr++; + } + + outptr0[0] = sum0; + + outptr0++; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_int8_rvv(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8b-maxk-inch-outch/8b + Mat kernel = _kernel.reshape(maxk, inch, outch); +#if __riscv_vector + kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + outch % 4, (size_t)1u); +#else + kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)1u); +#endif + int q = 0; +#if __riscv_vector + for (; q + 7 < outch; q += 8) + { + const Mat k0 = kernel.channel(q); + 
const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + const Mat k4 = kernel.channel(q + 4); + const Mat k5 = kernel.channel(q + 5); + const Mat k6 = kernel.channel(q + 6); + const Mat k7 = kernel.channel(q + 7); + + int8_t* g00 = kernel_tm.channel(q / 8); + + for (int p = 0; p < inch; p++) + { + const int8_t* k00 = (const int8_t*)k0.row(p); + const int8_t* k10 = (const int8_t*)k1.row(p); + const int8_t* k20 = (const int8_t*)k2.row(p); + const int8_t* k30 = (const int8_t*)k3.row(p); + const int8_t* k40 = (const int8_t*)k4.row(p); + const int8_t* k50 = (const int8_t*)k5.row(p); + const int8_t* k60 = (const int8_t*)k6.row(p); + const int8_t* k70 = (const int8_t*)k7.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = (int8_t)k00[k]; + g00[1] = (int8_t)k10[k]; + g00[2] = (int8_t)k20[k]; + g00[3] = (int8_t)k30[k]; + g00[4] = (int8_t)k40[k]; + g00[5] = (int8_t)k50[k]; + g00[6] = (int8_t)k60[k]; + g00[7] = (int8_t)k70[k]; + + g00 += 8; + } + } + } + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + + int8_t* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4); + + for (int p = 0; p < inch; p++) + { + const int8_t* k00 = (const int8_t*)k0.row(p); + const int8_t* k10 = (const int8_t*)k1.row(p); + const int8_t* k20 = (const int8_t*)k2.row(p); + const int8_t* k30 = (const int8_t*)k3.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = (int8_t)k00[k]; + g00[1] = (int8_t)k10[k]; + g00[2] = (int8_t)k20[k]; + g00[3] = (int8_t)k30[k]; + + g00 += 4; + } + } + } +#else + for (; q + 1 < outch; q += 2) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + + int8_t* g00 = kernel_tm.channel(q / 2); + + for (int p = 0; p < inch; p++) + { + const int8_t* k00 = (const int8_t*)k0.row(p); + const int8_t* k10 = (const int8_t*)k1.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + + g00 += 2; + } + } + } +#endif + for (; q < outch; q++) + { + const Mat k0 = kernel.channel(q); + +#if __riscv_vector + int8_t* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + q % 4); +#else + int8_t* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + for (int p = 0; p < inch; p++) + { + const int8_t* k00 = (const int8_t*)k0.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = (int8_t)k00[k]; + + g00 += 1; + } + } + } +} + +static void convolution_im2col_sgemm_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int8_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int8_t* sptr = (const int8_t*)img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + 
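+    // bottom_im2col now holds, per input channel p, maxk rows of length size = outw*outh:
+    // row (u * kernel_w + v) carries the input sample seen at kernel position (u, v) for every
+    // output pixel, i.e. output pixel (oy, ox) reads
+    //   bottom_blob.channel(p).row(oy * stride_h + u * dilation_h)[ox * stride_w + v * dilation_w]
+    // The matrix product of this against the interleaved kernel is done by the sgemm call below.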
im2col_sgemm_int8_rvv(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/riscv/convolution_sgemm_packnto1_int8.h b/src/layer/riscv/convolution_sgemm_packnto1_int8.h new file mode 100644 index 000000000000..d25b968c702a --- /dev/null +++ b/src/layer/riscv/convolution_sgemm_packnto1_int8.h @@ -0,0 +1,720 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_packnto1_int8_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + const int packn = csrr_vlenb(); + const size_t vl = vsetvl_e8m1(packn); + + // Mat bottom_im2col(size, maxk, inch, 1u * packn, packn, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + Mat tmp; + if (size >= 8) + tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + (size % 4) / 2 + size % 2, 1u * packn, packn, opt.workspace_allocator); + else if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + (size % 4) / 2 + size % 2, 1u * packn, packn, opt.workspace_allocator); + else if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u * packn, packn, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u * packn, packn, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size >> 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + + int8_t* tmpptr = tmp.channel(i / 8); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i * packn; + + for (int k = 0; k < maxk; k++) + { +#if C906 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr[2] = img0[l + packn * 2]; + tmpptr[3] = img0[l + packn * 3]; + tmpptr[4] = img0[l + packn * 4]; + tmpptr[5] = img0[l + packn * 5]; + tmpptr[6] = img0[l + packn * 6]; + tmpptr[7] = img0[l + packn * 7]; + tmpptr += 8; + } + + img0 += size * packn; +#else + vint8m1_t _val0 = vle8_v_i8m1(img0, vl); + vint8m1_t _val1 = vle8_v_i8m1(img0 + packn, vl); + vint8m1_t _val2 = vle8_v_i8m1(img0 + packn * 2, vl); + vint8m1_t _val3 = vle8_v_i8m1(img0 + packn * 3, vl); + vint8m1_t _val4 = vle8_v_i8m1(img0 + packn * 4, vl); + vint8m1_t _val5 = vle8_v_i8m1(img0 + packn * 5, vl); + vint8m1_t _val6 = vle8_v_i8m1(img0 + packn * 6, vl); + vint8m1_t _val7 = vle8_v_i8m1(img0 + packn * 7, vl); + vsseg8e8_v_i8m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); + + img0 += size * packn; + tmpptr += packn * 8; +#endif + } + } + } + + remain_size_start += nn_size << 3; + nn_size = (size - remain_size_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + + int8_t* tmpptr = 
tmp.channel(i / 8 + (i % 8) / 4); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i * packn; + + for (int k = 0; k < maxk; k++) + { +#if C906 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr[2] = img0[l + packn * 2]; + tmpptr[3] = img0[l + packn * 3]; + tmpptr += 4; + } + + img0 += size * packn; +#else + vint8m1_t _val0 = vle8_v_i8m1(img0, vl); + vint8m1_t _val1 = vle8_v_i8m1(img0 + packn, vl); + vint8m1_t _val2 = vle8_v_i8m1(img0 + packn * 2, vl); + vint8m1_t _val3 = vle8_v_i8m1(img0 + packn * 3, vl); + vsseg4e8_v_i8m1(tmpptr, _val0, _val1, _val2, _val3, vl); + + img0 += size * packn; + tmpptr += packn * 4; +#endif + } + } + } + + remain_size_start += nn_size << 2; + nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i * packn; + + for (int k = 0; k < maxk; k++) + { +#if C906 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr += 2; + } + + img0 += size * packn; +#else + vint8m1_t _val0 = vle8_v_i8m1(img0, vl); + vint8m1_t _val1 = vle8_v_i8m1(img0 + packn, vl); + vsseg2e8_v_i8m1(tmpptr, _val0, _val1, vl); + + img0 += size * packn; + tmpptr += packn * 2; +#endif + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i * packn; + + for (int k = 0; k < maxk; k++) + { + vint8m1_t _val = vle8_v_i8m1(img0, vl); + vse8_v_i8m1(tmpptr, _val, vl); + + img0 += size * packn; + tmpptr += packn; + } + } + } + } + + // TODO + int nn_outch = outch / packn; + int remain_outch_start = nn_outch * packn; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * packn; + + int8_t* outptr0 = top_blob.channel(p); + +#ifdef __clang__ + const int8_t* zeros = _zero_tmp; +#else + const int8_t zeros[packn] = {0}; +#endif // __clang__ + const int8_t* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 7 < size; i += 8) + { + const int8_t* tmpptr = tmp.channel(i / 8); + const int8_t* kptr0 = kernel.channel(p / packn); + + int nn = inch * maxk * packn; // inch always > 0 + + vint8m1_t _sum0 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum1 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum2 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum3 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum4 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum5 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum6 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum7 = vle8_v_i8m1(biasptr, vl); + + for (int j = 0; j < nn; j++) + { + int8_t val0 = *tmpptr++; + int8_t val1 = *tmpptr++; + int8_t val2 = *tmpptr++; + int8_t val3 = *tmpptr++; + int8_t val4 = *tmpptr++; + int8_t val5 = *tmpptr++; + int8_t val6 = *tmpptr++; + int8_t val7 = *tmpptr++; + vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl); + _sum0 = vmacc_vx_i8m1(_sum0, val0, _w0, vl); + _sum1 = vmacc_vx_i8m1(_sum1, val1, _w0, vl); + _sum2 = vmacc_vx_i8m1(_sum2, val2, _w0, vl); + _sum3 = vmacc_vx_i8m1(_sum3, val3, _w0, vl); + _sum4 = vmacc_vx_i8m1(_sum4, val4, _w0, vl); + _sum5 = vmacc_vx_i8m1(_sum5, val5, _w0, vl); + _sum6 = vmacc_vx_i8m1(_sum6, val6, _w0, vl); + _sum7 = vmacc_vx_i8m1(_sum7, val7, _w0, vl); + + kptr0 += packn; + } + +#if C906 + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, vl); + vsse8_v_i8m1(outptr0 + 1, top_blob.cstep * sizeof(int8_t), _sum1, vl); + vsse8_v_i8m1(outptr0 + 2, top_blob.cstep * sizeof(int8_t), _sum2, vl); + vsse8_v_i8m1(outptr0 + 3, top_blob.cstep * sizeof(int8_t), _sum3, vl); + vsse8_v_i8m1(outptr0 + 4, top_blob.cstep * sizeof(int8_t), _sum4, vl); + vsse8_v_i8m1(outptr0 + 5, top_blob.cstep * sizeof(int8_t), _sum5, vl); + vsse8_v_i8m1(outptr0 + 6, top_blob.cstep * sizeof(int8_t), _sum6, vl); + vsse8_v_i8m1(outptr0 + 7, top_blob.cstep * sizeof(int8_t), _sum7, vl); +#else + vssseg8e8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); +#endif + outptr0 += 8; + } + for (; i + 3 < size; i += 4) + { + const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4); + const int8_t* kptr0 = kernel.channel(p / packn); + + int nn = inch * maxk * packn; // inch always > 0 + + vint8m1_t _sum0 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum1 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum2 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum3 = vle8_v_i8m1(biasptr, vl); + + for (int j = 0; j < nn; j++) + { + int8_t val0 = *tmpptr++; + int8_t val1 = *tmpptr++; + int8_t val2 = *tmpptr++; + int8_t val3 = *tmpptr++; + vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl); + _sum0 = vmacc_vx_i8m1(_sum0, val0, _w0, vl); + _sum1 = vmacc_vx_i8m1(_sum1, val1, _w0, vl); + _sum2 = vmacc_vx_i8m1(_sum2, val2, _w0, vl); + _sum3 = vmacc_vx_i8m1(_sum3, val3, _w0, vl); + + kptr0 += packn; + } + +#if C906 + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, vl); + vsse8_v_i8m1(outptr0 + 1, top_blob.cstep * sizeof(int8_t), _sum1, vl); + vsse8_v_i8m1(outptr0 + 2, top_blob.cstep * sizeof(int8_t), _sum2, vl); + vsse8_v_i8m1(outptr0 + 3, top_blob.cstep * sizeof(int8_t), _sum3, vl); +#else + vssseg4e8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, _sum1, _sum2, _sum3, vl); +#endif + outptr0 += 4; + } + for (; i + 1 < size; i += 2) + { + const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2); + const int8_t* kptr0 = kernel.channel(p / packn); + + int nn = inch * maxk * packn; // inch always > 0 + + vint8m1_t _sum0 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum1 = vle8_v_i8m1(biasptr, vl); + + 
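+            // Each lane of _sum0/_sum1 is one of the packn output channels p..p+packn-1; every j
+            // step loads their packn weights into _w0 and broadcasts one activation per pixel,
+            // i.e. per lane c:  sum0[c] += val0 * w[c];  sum1[c] += val1 * w[c];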
for (int j = 0; j < nn; j++) + { + int8_t val0 = *tmpptr++; + int8_t val1 = *tmpptr++; + vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl); + _sum0 = vmacc_vx_i8m1(_sum0, val0, _w0, vl); + _sum1 = vmacc_vx_i8m1(_sum1, val1, _w0, vl); + + kptr0 += packn; + } + +#if C906 + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, vl); + vsse8_v_i8m1(outptr0 + 1, top_blob.cstep * sizeof(int8_t), _sum1, vl); +#else + vssseg2e8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, _sum1, vl); +#endif + outptr0 += 2; + } + for (; i < size; i++) + { + const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2); + const int8_t* kptr0 = kernel.channel(p / packn); + + int nn = inch * maxk * packn; // inch always > 0 + + vint8m1_t _sum = vle8_v_i8m1(biasptr, vl); + + for (int j = 0; j < nn; j++) + { + int8_t val = *tmpptr++; + vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl); + _sum = vmacc_vx_i8m1(_sum, val, _w0, vl); + + kptr0 += packn; + } + + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum, vl); + + outptr0 += 1; + } + } +#ifdef __clang__ + delete[] _zero_tmp; +#endif // __clang__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int8_t* outptr0 = top_blob.channel(p); + + const int8_t bias0 = bias ? bias[p] : 0; + + int i = 0; + for (; i + 7 < size; i += 8) + { + const int8_t* tmpptr = tmp.channel(i / 8); + const int8_t* kptr0 = kernel.channel(p / packn + p % packn); + + int nn = inch * maxk; // inch always > 0 + + int8_t sum0 = bias0; + int8_t sum1 = bias0; + int8_t sum2 = bias0; + int8_t sum3 = bias0; + int8_t sum4 = bias0; + int8_t sum5 = bias0; + int8_t sum6 = bias0; + int8_t sum7 = bias0; + + vint8m1_t _sum0 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum1 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum2 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum3 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum4 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum5 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum6 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum7 = vmv_v_x_i8m1(0, vl); + + for (int j = 0; j < nn; j++) + { + vint8m1_t _val0; + vint8m1_t _val1; + vint8m1_t _val2; + vint8m1_t _val3; + vint8m1_t _val4; + vint8m1_t _val5; + vint8m1_t _val6; + vint8m1_t _val7; + vlseg8e8_v_i8m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); + vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl); + _sum0 = vmacc_vv_i8m1(_sum0, _val0, _w0, vl); + _sum1 = vmacc_vv_i8m1(_sum1, _val1, _w0, vl); + _sum2 = vmacc_vv_i8m1(_sum2, _val2, _w0, vl); + _sum3 = vmacc_vv_i8m1(_sum3, _val3, _w0, vl); + _sum4 = vmacc_vv_i8m1(_sum4, _val4, _w0, vl); + _sum5 = vmacc_vv_i8m1(_sum5, _val5, _w0, vl); + _sum6 = vmacc_vv_i8m1(_sum6, _val6, _w0, vl); + _sum7 = vmacc_vv_i8m1(_sum7, _val7, _w0, vl); + tmpptr += packn * 8; + kptr0 += packn; + } + +#if C906 + // TODO + std::vector ss0(packn); + std::vector ss1(packn); + std::vector ss2(packn); + std::vector ss3(packn); + std::vector ss4(packn); + std::vector ss5(packn); + std::vector ss6(packn); + std::vector ss7(packn); + vse8_v_i8m1((int8_t*)ss0.data(), _sum0, vl); + vse8_v_i8m1((int8_t*)ss1.data(), _sum1, vl); + vse8_v_i8m1((int8_t*)ss2.data(), _sum2, vl); + vse8_v_i8m1((int8_t*)ss3.data(), _sum3, vl); + vse8_v_i8m1((int8_t*)ss4.data(), _sum4, vl); + vse8_v_i8m1((int8_t*)ss5.data(), _sum5, vl); + vse8_v_i8m1((int8_t*)ss6.data(), _sum6, vl); + vse8_v_i8m1((int8_t*)ss7.data(), _sum7, vl); + for (int i = 0; i < packn; i++) + { + sum0 += ss0[i]; + sum1 += ss1[i]; + sum2 += ss2[i]; + sum3 += ss3[i]; + sum4 += ss4[i]; + sum5 += ss5[i]; + sum6 += ss6[i]; + 
+                sum7 += ss7[i];
+            }
+#else
+            sum0 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum0, vmv_s_x_i8m1(vint8m1_t(), sum0, vl), vl));
+            sum1 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum1, vmv_s_x_i8m1(vint8m1_t(), sum1, vl), vl));
+            sum2 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum2, vmv_s_x_i8m1(vint8m1_t(), sum2, vl), vl));
+            sum3 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum3, vmv_s_x_i8m1(vint8m1_t(), sum3, vl), vl));
+            sum4 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum4, vmv_s_x_i8m1(vint8m1_t(), sum4, vl), vl));
+            sum5 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum5, vmv_s_x_i8m1(vint8m1_t(), sum5, vl), vl));
+            sum6 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum6, vmv_s_x_i8m1(vint8m1_t(), sum6, vl), vl));
+            sum7 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum7, vmv_s_x_i8m1(vint8m1_t(), sum7, vl), vl));
+#endif
+
+            outptr0[0] = sum0;
+            outptr0[1] = sum1;
+            outptr0[2] = sum2;
+            outptr0[3] = sum3;
+            outptr0[4] = sum4;
+            outptr0[5] = sum5;
+            outptr0[6] = sum6;
+            outptr0[7] = sum7;
+
+            outptr0 += 8;
+        }
+        for (; i + 3 < size; i += 4)
+        {
+            const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4);
+            const int8_t* kptr0 = kernel.channel(p / packn + p % packn);
+
+            int nn = inch * maxk; // inch always > 0
+
+            int8_t sum0 = bias0;
+            int8_t sum1 = bias0;
+            int8_t sum2 = bias0;
+            int8_t sum3 = bias0;
+
+            vint8m1_t _sum0 = vmv_v_x_i8m1(0, vl);
+            vint8m1_t _sum1 = vmv_v_x_i8m1(0, vl);
+            vint8m1_t _sum2 = vmv_v_x_i8m1(0, vl);
+            vint8m1_t _sum3 = vmv_v_x_i8m1(0, vl);
+
+            for (int j = 0; j < nn; j++)
+            {
+                vint8m1_t _val0;
+                vint8m1_t _val1;
+                vint8m1_t _val2;
+                vint8m1_t _val3;
+
+                vlseg4e8_v_i8m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl);
+                vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl);
+                _sum0 = vmacc_vv_i8m1(_sum0, _val0, _w0, vl);
+                _sum1 = vmacc_vv_i8m1(_sum1, _val1, _w0, vl);
+                _sum2 = vmacc_vv_i8m1(_sum2, _val2, _w0, vl);
+                _sum3 = vmacc_vv_i8m1(_sum3, _val3, _w0, vl);
+                tmpptr += packn * 4;
+                kptr0 += packn;
+            }
+
+#if C906
+            // TODO
+            std::vector<int8_t> ss0(packn);
+            std::vector<int8_t> ss1(packn);
+            std::vector<int8_t> ss2(packn);
+            std::vector<int8_t> ss3(packn);
+            vse8_v_i8m1((int8_t*)ss0.data(), _sum0, vl);
+            vse8_v_i8m1((int8_t*)ss1.data(), _sum1, vl);
+            vse8_v_i8m1((int8_t*)ss2.data(), _sum2, vl);
+            vse8_v_i8m1((int8_t*)ss3.data(), _sum3, vl);
+            for (int i = 0; i < packn; i++)
+            {
+                sum0 += ss0[i];
+                sum1 += ss1[i];
+                sum2 += ss2[i];
+                sum3 += ss3[i];
+            }
+#else
+            sum0 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum0, vmv_s_x_i8m1(vint8m1_t(), sum0, vl), vl));
+            sum1 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum1, vmv_s_x_i8m1(vint8m1_t(), sum1, vl), vl));
+            sum2 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum2, vmv_s_x_i8m1(vint8m1_t(), sum2, vl), vl));
+            sum3 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum3, vmv_s_x_i8m1(vint8m1_t(), sum3, vl), vl));
+#endif
+
+            outptr0[0] = sum0;
+            outptr0[1] = sum1;
+            outptr0[2] = sum2;
+            outptr0[3] = sum3;
+
+            outptr0 += 4;
+        }
+        for (; i + 1 < size; i += 2)
+        {
+            const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2);
+            const int8_t* kptr0 = kernel.channel(p / packn + p % packn);
+
+            int nn = inch * maxk; // inch always > 0
+
+            int8_t sum0 = bias0;
+            int8_t sum1 = bias0;
+
+            vint8m1_t _sum0 = vmv_v_x_i8m1(0, vl);
+            vint8m1_t _sum1 = vmv_v_x_i8m1(0, vl);
+
+            for (int j = 0; j < nn; j++)
+            {
+                vint8m1_t _val0;
+                vint8m1_t _val1;
+                vlseg2e8_v_i8m1(&_val0, &_val1, tmpptr, vl);
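+                // one packn-wide weight column per iteration, lane-wise multiply-accumulate for both pixels; lanes are reduced to scalars after the loop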
                vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl);
+                _sum0 = vmacc_vv_i8m1(_sum0, _val0, _w0, vl);
+                _sum1 = vmacc_vv_i8m1(_sum1, _val1, _w0, vl);
+                tmpptr += packn * 2;
+                kptr0 += packn;
+            }
+
+#if C906
+            // TODO
+            std::vector<int8_t> ss0(packn);
+            std::vector<int8_t> ss1(packn);
+            vse8_v_i8m1((int8_t*)ss0.data(), _sum0, vl);
+            vse8_v_i8m1((int8_t*)ss1.data(), _sum1, vl);
+            for (int i = 0; i < packn; i++)
+            {
+                sum0 += ss0[i];
+                sum1 += ss1[i];
+            }
+#else
+            sum0 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum0, vmv_s_x_i8m1(vint8m1_t(), sum0, vl), vl));
+            sum1 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum1, vmv_s_x_i8m1(vint8m1_t(), sum1, vl), vl));
+#endif
+
+            outptr0[0] = sum0;
+            outptr0[1] = sum1;
+
+            outptr0 += 2;
+        }
+        for (; i < size; i++)
+        {
+            const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
+            const int8_t* kptr0 = kernel.channel(p / packn + p % packn);
+
+            int nn = inch * maxk; // inch always > 0
+
+            int8_t sum0 = bias0;
+
+            vint8m1_t _sum0 = vmv_v_x_i8m1(0, vl);
+
+            for (int j = 0; j < nn; j++)
+            {
+                vint8m1_t _val0 = vle8_v_i8m1(tmpptr, vl);
+                vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl);
+                _sum0 = vmacc_vv_i8m1(_sum0, _val0, _w0, vl);
+                tmpptr += packn;
+                kptr0 += packn;
+            }
+
+#if C906
+            // TODO
+            std::vector<int8_t> ss0(packn);
+            vse8_v_i8m1((int8_t*)ss0.data(), _sum0, vl);
+            for (int i = 0; i < packn; i++)
+            {
+                sum0 += ss0[i];
+            }
+#else
+            sum0 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum0, vmv_s_x_i8m1(vint8m1_t(), sum0, vl), vl));
+#endif
+
+            outptr0[0] = sum0;
+
+            outptr0 += 1;
+        }
+    }
+}
+
+static void convolution_im2col_sgemm_transform_kernel_packnto1_int8_rvv(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
+{
+    const int packn = csrr_vlenb();
+
+    const int maxk = kernel_w * kernel_h;
+
+    // interleave
+    // src = maxk-inch-outch
+    // dst = pb-pa-maxk-inch/pa-outch/pb
+    Mat kernel = _kernel.reshape(maxk, inch, outch);
+    kernel_tm.create(packn * packn * maxk, inch / packn, outch / packn + outch % packn, (size_t)1u);
+
+    int q = 0;
+    for (; q + (packn - 1) < outch; q += packn)
+    {
+        int8_t* g00 = kernel_tm.channel(q / packn);
+
+        for (int p = 0; p + (packn - 1) < inch; p += packn)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int i = 0; i < packn; i++)
+                {
+                    for (int j = 0; j < packn; j++)
+                    {
+                        const signed char* k00 = kernel.channel(q + j).row<const signed char>(p + i);
+
+                        g00[0] = k00[k];
+
+                        g00++;
+                    }
+                }
+            }
+        }
+    }
+    for (; q < outch; q++)
+    {
+        const Mat k0 = kernel.channel(q);
+
+        int8_t* g00 = kernel_tm.channel(q / packn + q % packn);
+
+        for (int p = 0; p + (packn - 1) < inch; p += packn)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int j = 0; j < packn; j++)
+                {
+                    const signed char* k00 = k0.row<const signed char>(p + j);
+
+                    g00[0] = k00[k];
+
+                    g00++;
+                }
+            }
+        }
+    }
+}
+
+static void convolution_im2col_sgemm_packnto1_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    const int packn = csrr_vlenb();
+    const size_t vl = vsetvl_e8m1(packn);
+
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    const int size = outw * outh;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // im2col
+    Mat bottom_im2col(size, maxk, inch, 1u * packn, packn, opt.workspace_allocator);
+    {
+        const int gap = (w * stride_h - outw * stride_w) * packn;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < inch; p++)
+        {
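+            // im2col: for each kernel tap (u,v), gather the packn-packed pixels of this input channel into a contiguous row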
+ const Mat img = bottom_blob.channel(p); + int8_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int8_t* sptr = img.row(dilation_h * u) + dilation_w * v * packn; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + vint8m1_t _val = vle8_v_i8m1(sptr, vl); + vse8_v_i8m1(ptr, _val, vl); + + sptr += stride_w * packn; + ptr += packn; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_packnto1_int8_rvv(bottom_im2col, top_blob, kernel, _bias, opt); +} From 57b961a529a26051f57d30400ebf3259419e938f Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 25 Dec 2023 16:22:00 +0800 Subject: [PATCH 04/10] innerproduct int8 riscv rvv Signed-off-by: Molly Sophia --- src/layer/riscv/innerproduct_riscv.cpp | 154 +++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 9 deletions(-) diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index c0d22817710a..d3a8c15cf978 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -1140,6 +1140,9 @@ int InnerProduct_riscv::create_pipeline_int8_riscv(const Option& opt) int InnerProduct_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_vector + const int packn = csrr_vlenb(); +#endif const int num_input = weight_data_size / num_output; int elembits = bottom_blob.elembits(); @@ -1178,8 +1181,46 @@ int InnerProduct_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob for (int j = 0; j < outh; j++) { float* outptr = top_blob.row(j); +#if __riscv_vector + int nn_num_output = num_output / packn ? num_output / packn - 1 : 0; + int remain_num_output_start = nn_num_output * packn; + + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * packn; - for (int p = 0; p < num_output; p++) + size_t vl = vsetvl_e8m1(packn); + vint32m4_t _sum = vmv_v_x_i32m4(0, vl); + + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + for (int i = 0; i < num_input; i++) + { + vint16m2_t _kptr = vwcvt_x_x_v_i16m2(vlse8_v_i8m1(kptr, num_input, vl), vl); + _sum = vwmacc_vx_i32m4(_sum, *m, _kptr, vl); + m++; + kptr++; + } + + vfloat32m4_t _sumfp32; + if (bias_term) + _sumfp32 = vle32_v_f32m4((const float *)bias_data + p, vl); + else + _sumfp32 = vfmv_v_f_f32m4(0.f, vl); + + _sumfp32 = vfmacc_vv_f32m4(_sumfp32, vreinterpret_v_i32m4_f32m4(_sum), + vle32_v_f32m4((const float *)scale_in_data + p, vl), vl); + + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + vse32_v_f32m4((float*)outptr, _sumfp32, vl); + } +#else + int remain_num_output_start = 0; +#endif + + for (int p = remain_num_output_start; p < num_output; p++) { const signed char* kptr = weight_data_tm.row(p); const signed char* m = bottom_blob_int8_unpacked.row(j); @@ -1226,22 +1267,117 @@ int InnerProduct_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob if (out_elempack == 1) { +#if __riscv_vector + int nn_num_output = num_output / packn ? 
num_output / packn - 1 : 0; + int remain_num_output_start = nn_num_output * packn; + #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * packn; + + size_t vl = vsetvl_e8m1(packn); + vint32m4_t _sum = vmv_v_x_i32m4(0, vl); + + const signed char* w = weight_data_tm.row(p); + + const signed char* m = bottom_blob_int8_flattened; + + int n = num_input; + while (n > 0) + { + vint16m2_t _w = vwcvt_x_x_v_i16m2(vlse8_v_i8m1(w, num_input, vl), vl); + _sum = vwmacc_vx_i32m4(_sum, *m, _w, vl); + + m += 1; + w += 1; + n -= 1; + } + + vfloat32m4_t sumfp32; + if (bias_term) + sumfp32 = vle32_v_f32m4((const float *)bias_data + p, vl); + else + sumfp32 = vfmv_v_f_f32m4(0.f, vl); + + sumfp32 = vfmacc_vv_f32m4(sumfp32, vreinterpret_v_i32m4_f32m4(_sum), + vle32_v_f32m4((const float *)scale_in_data + p, vl), vl); + + sumfp32 = activation_ps(sumfp32, activation_type, activation_params, vl); + + vse32_v_f32m4((float*)top_blob + p, sumfp32, vl); + } +#else + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + const signed char* w0 = weight_data_tm.row(p); + const signed char* w1 = weight_data_tm.row(p + 1); + const signed char* w2 = weight_data_tm.row(p + 2); + const signed char* w3 = weight_data_tm.row(p + 3); + + const signed char* m = bottom_blob_int8_flattened; + + for (int i = 0; i < num_input; i++) + { + sum0 += *m * *w0; + sum1 += *m * *w1; + sum2 += *m * *w2; + sum3 += *m * *w3; + + m++; + w0++; + w1++; + w2++; + w3++; + } + + float sumfp32_0 = sum0 * scale_in_data[p]; + float sumfp32_1 = sum1 * scale_in_data[p + 1]; + float sumfp32_2 = sum2 * scale_in_data[p + 2]; + float sumfp32_3 = sum3 * scale_in_data[p + 3]; + + if (bias_term) + { + sumfp32_0 += bias_data[p]; + sumfp32_1 += bias_data[p + 1]; + sumfp32_2 += bias_data[p + 2]; + sumfp32_3 += bias_data[p + 3]; + } + + sumfp32_0 = activation_ss(sumfp32_0, activation_type, activation_params); + sumfp32_1 = activation_ss(sumfp32_1, activation_type, activation_params); + sumfp32_2 = activation_ss(sumfp32_2, activation_type, activation_params); + sumfp32_3 = activation_ss(sumfp32_3, activation_type, activation_params); + + top_blob[p] = sumfp32_0; + top_blob[p + 1] = sumfp32_1; + top_blob[p + 2] = sumfp32_2; + top_blob[p + 3] = sumfp32_3; + } +#endif // __riscv_vector + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) { int sum = 0; const signed char* kptr = weight_data_tm.row(p); const signed char* sptr = bottom_blob_int8_flattened; - int i = 0; - for (; i < num_input; i++) + for (int i = 0; i < num_input; i++) { - signed char val = sptr[0]; - - signed char w = kptr[0]; - - sum += val * w; + sum += *sptr * *kptr; sptr += 1; kptr += 1; From 2dacf8043f119bc9ee48b39afeb430034236e67b Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Thu, 8 Feb 2024 21:55:14 +0800 Subject: [PATCH 05/10] riscv quantize packing Signed-off-by: Molly Sophia --- src/layer/riscv/quantize_riscv.cpp | 688 +++++++++++++++++++++++++++++ src/layer/riscv/quantize_riscv.h | 32 ++ 2 files changed, 720 insertions(+) create mode 100644 src/layer/riscv/quantize_riscv.cpp create mode 100644 src/layer/riscv/quantize_riscv.h diff --git a/src/layer/riscv/quantize_riscv.cpp 
b/src/layer/riscv/quantize_riscv.cpp new file mode 100644 index 000000000000..172c7d45ab3d --- /dev/null +++ b/src/layer/riscv/quantize_riscv.cpp @@ -0,0 +1,688 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_riscv.h" + +#include "riscv_usability.h" + +namespace ncnn { + +Quantize_riscv::Quantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_vector + int packn = csrr_vlenb() / 4; + int out_packn = packn * 4; + size_t vl = vsetvl_e32m4(packn); +#endif + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack == packn) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % out_packn == 0 ? out_packn : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % out_packn == 0 ? 
out_packn : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == out_packn) + { + if (scale_data_size == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 4); + const float* ptr1 = bottom_blob.row(i * 4 + 1); + const float* ptr2 = bottom_blob.row(i * 4 + 2); + const float* ptr3 = bottom_blob.row(i * 4 + 3); + signed char* outptr = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + vfloat32m4_t _ptr1 = vle32_v_f32m4(ptr1, vl); + vfloat32m4_t _ptr2 = vle32_v_f32m4(ptr2, vl); + vfloat32m4_t _ptr3 = vle32_v_f32m4(ptr3, vl); + _ptr0 = vfmul_vf_f32m4(_ptr0, scale_data[0], vl); + _ptr1 = vfmul_vf_f32m4(_ptr1, scale_data[0], vl); + _ptr2 = vfmul_vf_f32m4(_ptr2, scale_data[0], vl); + _ptr3 = vfmul_vf_f32m4(_ptr3, scale_data[0], vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vint8m1_t out1 = float2int8(_ptr1, vl); + vint8m1_t out2 = float2int8(_ptr2, vl); + vint8m1_t out3 = float2int8(_ptr3, vl); + vse8_v_i8m1(outptr, out0, vl); + vse8_v_i8m1(outptr + packn, out1, vl); + vse8_v_i8m1(outptr + 2 * packn, out2, vl); + vse8_v_i8m1(outptr + 3 * packn, out3, vl); + + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + outptr += out_packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 4); + const float* ptr1 = bottom_blob.row(i * 4 + 1); + const float* ptr2 = bottom_blob.row(i * 4 + 2); + const float* ptr3 = bottom_blob.row(i * 4 + 3); + signed char* outptr = top_blob.row(i); + vfloat32m4_t _scale0 = vle32_v_f32m4((const float*)scale_data + 4 * i * packn, vl); + vfloat32m4_t _scale1 = vle32_v_f32m4((const float*)scale_data + (4 * i + 1) * packn, vl); + vfloat32m4_t _scale2 = vle32_v_f32m4((const float*)scale_data + (4 * i + 2) * packn, vl); + vfloat32m4_t _scale3 = vle32_v_f32m4((const float*)scale_data + (4 * i + 3) * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + vfloat32m4_t _ptr1 = vle32_v_f32m4(ptr1, vl); + vfloat32m4_t _ptr2 = vle32_v_f32m4(ptr2, vl); + vfloat32m4_t _ptr3 = vle32_v_f32m4(ptr3, vl); + _ptr0 = vfmul_vv_f32m4(_ptr0, _scale0, vl); + _ptr1 = vfmul_vv_f32m4(_ptr1, _scale1, vl); + _ptr2 = vfmul_vv_f32m4(_ptr2, _scale2, vl); + _ptr3 = vfmul_vv_f32m4(_ptr3, _scale3, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vint8m1_t out1 = float2int8(_ptr1, vl); + vint8m1_t out2 = float2int8(_ptr2, vl); + vint8m1_t out3 = float2int8(_ptr3, vl); + vse8_v_i8m1(outptr, out0, vl); + vse8_v_i8m1(outptr + packn, out1, vl); + vse8_v_i8m1(outptr + 2 * packn, out2, vl); + vse8_v_i8m1(outptr + 3 * packn, out3, vl); + + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + outptr += out_packn; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * packn); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + _ptr0 = vfmul_vf_f32m4(_ptr0, scale, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vsse8_v_i8m1(outptr0, top_blob.w * sizeof(int8_t), out0, vl); + + ptr0 += packn; + 
outptr0 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * packn); + + vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + _ptr0 = vfmul_vv_f32m4(_ptr0, _scale, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vsse8_v_i8m1(outptr0, top_blob.w * sizeof(int8_t), out0, vl); + + ptr0 += packn; + outptr0 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % out_packn == 0 ? out_packn : 1; + int outc = channels * elempack / out_elempack; + NCNN_LOGE("out_elempack:%d", out_elempack); + NCNN_LOGE("outc:%d", outc); + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == out_packn) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 4); + const float* ptr1 = bottom_blob.channel(q * 4 + 1); + const float* ptr2 = bottom_blob.channel(q * 4 + 2); + const float* ptr3 = bottom_blob.channel(q * 4 + 3); + signed char* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + vfloat32m4_t _ptr1 = vle32_v_f32m4(ptr1, vl); + vfloat32m4_t _ptr2 = vle32_v_f32m4(ptr2, vl); + vfloat32m4_t _ptr3 = vle32_v_f32m4(ptr3, vl); + _ptr0 = vfmul_vf_f32m4(_ptr0, scale, vl); + _ptr1 = vfmul_vf_f32m4(_ptr1, scale, vl); + _ptr2 = vfmul_vf_f32m4(_ptr2, scale, vl); + _ptr3 = vfmul_vf_f32m4(_ptr3, scale, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vint8m1_t out1 = float2int8(_ptr1, vl); + vint8m1_t out2 = float2int8(_ptr2, vl); + vint8m1_t out3 = float2int8(_ptr3, vl); + vse8_v_i8m1(outptr, out0, vl); + vse8_v_i8m1(outptr + packn, out1, vl); + vse8_v_i8m1(outptr + 2 * packn, out2, vl); + vse8_v_i8m1(outptr + 3 * packn, out3, vl); + + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + outptr += out_packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 4); + const float* ptr1 = bottom_blob.channel(q * 4 + 1); + const float* ptr2 = bottom_blob.channel(q * 4 + 2); + const float* ptr3 = bottom_blob.channel(q * 4 + 3); + signed char* outptr = top_blob.channel(q); + + vfloat32m4_t _scale0 = vle32_v_f32m4((const float*)scale_data + q * 4 * packn, vl); + vfloat32m4_t _scale1 = vle32_v_f32m4((const float*)scale_data + (q * 4 + 1) * packn, vl); + vfloat32m4_t _scale2 = vle32_v_f32m4((const float*)scale_data + (q * 4 + 2) * packn, vl); + vfloat32m4_t _scale3 = vle32_v_f32m4((const float*)scale_data + (q * 4 + 3) * packn, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + vfloat32m4_t _ptr1 = vle32_v_f32m4(ptr1, vl); + vfloat32m4_t _ptr2 = vle32_v_f32m4(ptr2, vl); + vfloat32m4_t _ptr3 = vle32_v_f32m4(ptr3, vl); + _ptr0 = vfmul_vv_f32m4(_ptr0, _scale0, vl); + _ptr1 = vfmul_vv_f32m4(_ptr1, _scale1, vl); + _ptr2 = vfmul_vv_f32m4(_ptr2, _scale2, vl); + _ptr3 = vfmul_vv_f32m4(_ptr3, _scale3, vl); + vint8m1_t out0 = 
float2int8(_ptr0, vl); + vint8m1_t out1 = float2int8(_ptr1, vl); + vint8m1_t out2 = float2int8(_ptr2, vl); + vint8m1_t out3 = float2int8(_ptr3, vl); + vse8_v_i8m1(outptr, out0, vl); + vse8_v_i8m1(outptr + packn, out1, vl); + vse8_v_i8m1(outptr + 2 * packn, out2, vl); + vse8_v_i8m1(outptr + 3 * packn, out3, vl); + + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + outptr += out_packn; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * packn); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + _ptr0 = vfmul_vf_f32m4(_ptr0, scale, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), out0, vl); + + ptr0 += packn; + outptr0 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * packn); + + vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + q * packn, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + _ptr0 = vfmul_vv_f32m4(_ptr0, _scale, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), out0, vl); + + ptr0 += packn; + outptr0 += 1; + } + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + + *outptr++ = float2int8(*ptr++ * scale); + } + } + else + { + const float* scaleptr = scale_data; +#if __riscv_vector + int num_nn = w / (packn * 8); + int remain_w_start = num_nn * packn * 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + vfloat32m4_t _p4 = vle32_v_f32m4(ptr + 4 * packn, vl); + vfloat32m4_t _p5 = vle32_v_f32m4(ptr + 5 * packn, vl); + vfloat32m4_t _p6 = vle32_v_f32m4(ptr + 6 * packn, vl); + vfloat32m4_t _p7 = vle32_v_f32m4(ptr + 7 * packn, vl); + vfloat32m4_t _scale0 = vle32_v_f32m4(scaleptr, vl); + vfloat32m4_t _scale1 = vle32_v_f32m4(scaleptr + packn, vl); + vfloat32m4_t _scale2 = vle32_v_f32m4(scaleptr + 2 * packn, vl); + vfloat32m4_t _scale3 = vle32_v_f32m4(scaleptr + 3 * packn, vl); + vfloat32m4_t _scale4 = vle32_v_f32m4(scaleptr + 4 * packn, vl); + vfloat32m4_t _scale5 = vle32_v_f32m4(scaleptr + 5 * packn, vl); + vfloat32m4_t _scale6 = vle32_v_f32m4(scaleptr + 6 * packn, vl); + vfloat32m4_t _scale7 = vle32_v_f32m4(scaleptr + 7 * packn, vl); + _p0 = vfmul_vv_f32m4(_p0, _scale0, vl); + _p1 = vfmul_vv_f32m4(_p1, _scale1, vl); + _p2 = vfmul_vv_f32m4(_p2, _scale2, vl); + _p3 = vfmul_vv_f32m4(_p3, _scale3, vl); + _p4 = vfmul_vv_f32m4(_p4, _scale4, vl); + _p5 = vfmul_vv_f32m4(_p5, _scale5, vl); + _p6 = vfmul_vv_f32m4(_p6, _scale6, vl); + 
_p7 = vfmul_vv_f32m4(_p7, _scale7, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vint8m1_t _outp4 = float2int8(_p4, vl); + vint8m1_t _outp5 = float2int8(_p5, vl); + vint8m1_t _outp6 = float2int8(_p6, vl); + vint8m1_t _outp7 = float2int8(_p7, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + vse8_v_i8m1(outptr + 4 * packn, _outp4, vl); + vse8_v_i8m1(outptr + 5 * packn, _outp5, vl); + vse8_v_i8m1(outptr + 6 * packn, _outp6, vl); + vse8_v_i8m1(outptr + 7 * packn, _outp7, vl); + ptr += 8 * packn; + outptr += 8 * packn; + scaleptr += 8 * packn; + } + + num_nn = (w - remain_w_start) / (packn * 4); + remain_w_start += num_nn * packn * 4; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + vfloat32m4_t _scale0 = vle32_v_f32m4(scaleptr, vl); + vfloat32m4_t _scale1 = vle32_v_f32m4(scaleptr + packn, vl); + vfloat32m4_t _scale2 = vle32_v_f32m4(scaleptr + 2 * packn, vl); + vfloat32m4_t _scale3 = vle32_v_f32m4(scaleptr + 3 * packn, vl); + _p0 = vfmul_vv_f32m4(_p0, _scale0, vl); + _p1 = vfmul_vv_f32m4(_p1, _scale1, vl); + _p2 = vfmul_vv_f32m4(_p2, _scale2, vl); + _p3 = vfmul_vv_f32m4(_p3, _scale3, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + ptr += 4 * packn; + outptr += 4 * packn; + scaleptr += 4 * packn; + } +#else + int remain_w_start = 0; +#endif + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + *outptr++ = float2int8(*ptr++ * *scaleptr++); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + signed char* outptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[i]; + +#if __riscv_vector + int num_nn = w / (packn * 8); + int remain_w_start = num_nn * packn * 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + vfloat32m4_t _p4 = vle32_v_f32m4(ptr + 4 * packn, vl); + vfloat32m4_t _p5 = vle32_v_f32m4(ptr + 5 * packn, vl); + vfloat32m4_t _p6 = vle32_v_f32m4(ptr + 6 * packn, vl); + vfloat32m4_t _p7 = vle32_v_f32m4(ptr + 7 * packn, vl); + _p0 = vfmul_vf_f32m4(_p0, scale, vl); + _p1 = vfmul_vf_f32m4(_p1, scale, vl); + _p2 = vfmul_vf_f32m4(_p2, scale, vl); + _p3 = vfmul_vf_f32m4(_p3, scale, vl); + _p4 = vfmul_vf_f32m4(_p4, scale, vl); + _p5 = vfmul_vf_f32m4(_p5, scale, vl); + _p6 = vfmul_vf_f32m4(_p6, scale, vl); + _p7 = vfmul_vf_f32m4(_p7, scale, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vint8m1_t _outp4 = float2int8(_p4, vl); + vint8m1_t _outp5 = float2int8(_p5, vl); + vint8m1_t _outp6 = float2int8(_p6, vl); + vint8m1_t _outp7 = float2int8(_p7, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + vse8_v_i8m1(outptr + 4 * packn, _outp4, vl); + vse8_v_i8m1(outptr + 5 * packn, _outp5, vl); + vse8_v_i8m1(outptr + 6 * packn, _outp6, vl); + vse8_v_i8m1(outptr + 7 * packn, _outp7, vl); + ptr += 8 * packn; + outptr += 8 * packn; + } + + num_nn = (w - remain_w_start) / (packn * 4); + remain_w_start += num_nn * packn * 4; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + _p0 = vfmul_vf_f32m4(_p0, scale, vl); + _p1 = vfmul_vf_f32m4(_p1, scale, vl); + _p2 = vfmul_vf_f32m4(_p2, scale, vl); + _p3 = vfmul_vf_f32m4(_p3, scale, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + ptr += 4 * packn; + outptr += 4 * packn; + } +#else + int remain_w_start = 0; +#endif + for (int j = remain_w_start; j < w; j++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + +#if __riscv_vector + int num_nn = w / (packn * 8); + int remain_w_start = num_nn * packn * 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + vfloat32m4_t _p4 = vle32_v_f32m4(ptr + 4 * packn, vl); + vfloat32m4_t _p5 = vle32_v_f32m4(ptr + 5 * packn, vl); + vfloat32m4_t _p6 = vle32_v_f32m4(ptr + 6 * packn, vl); + vfloat32m4_t _p7 = vle32_v_f32m4(ptr + 7 * packn, vl); + _p0 = vfmul_vf_f32m4(_p0, scale, vl); + _p1 = vfmul_vf_f32m4(_p1, scale, vl); + _p2 = vfmul_vf_f32m4(_p2, scale, vl); + _p3 = vfmul_vf_f32m4(_p3, scale, vl); + _p4 = vfmul_vf_f32m4(_p4, scale, vl); + _p5 = vfmul_vf_f32m4(_p5, scale, vl); + _p6 = vfmul_vf_f32m4(_p6, scale, vl); + _p7 = vfmul_vf_f32m4(_p7, scale, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vint8m1_t _outp4 = float2int8(_p4, vl); + vint8m1_t _outp5 = float2int8(_p5, vl); + vint8m1_t _outp6 = float2int8(_p6, vl); + vint8m1_t _outp7 = float2int8(_p7, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + vse8_v_i8m1(outptr + 4 * packn, _outp4, vl); + vse8_v_i8m1(outptr + 5 * packn, _outp5, vl); + vse8_v_i8m1(outptr + 6 * packn, _outp6, vl); + vse8_v_i8m1(outptr + 7 * packn, _outp7, vl); + ptr += 8 * packn; + outptr += 8 * packn; + } + + num_nn = (w - remain_w_start) / (packn * 4); + remain_w_start += num_nn * packn * 4; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + _p0 = vfmul_vf_f32m4(_p0, scale, vl); + _p1 = vfmul_vf_f32m4(_p1, scale, vl); + _p2 = vfmul_vf_f32m4(_p2, scale, vl); + _p3 = vfmul_vf_f32m4(_p3, scale, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + ptr += 4 * packn; + outptr += 4 * packn; + } +#else + int remain_w_start = 0; +#endif + for (int i = remain_w_start; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/quantize_riscv.h b/src/layer/riscv/quantize_riscv.h new file mode 100644 index 000000000000..c91c93b17bfa --- /dev/null +++ b/src/layer/riscv/quantize_riscv.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_QUANTIZE_RISCV_H +#define LAYER_QUANTIZE_RISCV_H + +#include "quantize.h" + +namespace ncnn { + +class Quantize_riscv : virtual public Quantize +{ +public: + Quantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_QUANTIZE_RISCV_H From 50425e16a0e0949d95418a0430d47f2dea71d4c8 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 9 Feb 2024 12:12:28 +0800 Subject: [PATCH 06/10] riscv requantize packing Signed-off-by: Molly Sophia --- src/layer/riscv/requantize_riscv.cpp | 683 +++++++++++++++++++++++++++ src/layer/riscv/requantize_riscv.h | 32 ++ src/layer/riscv/riscv_usability.h | 51 ++ tests/test_dequantize.cpp | 4 + tests/test_requantize.cpp | 4 + tests/testutil.h | 4 + 6 files changed, 778 insertions(+) create mode 100644 src/layer/riscv/requantize_riscv.cpp create mode 100644 src/layer/riscv/requantize_riscv.h diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp new file mode 100644 index 000000000000..220087691d54 --- /dev/null +++ b/src/layer/riscv/requantize_riscv.cpp @@ -0,0 +1,683 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
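+// requantize: int32 -> int8 with per-tensor or per-channel scale_in/scale_out, optional bias and fused activation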
+ +#include "requantize_riscv.h" + +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +Requantize_riscv::Requantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_vector + int packn = csrr_vlenb(); + size_t vl = vsetvl_e32m4(packn); +#endif + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack != packn && elempack != 1) + { + Mat bottom_blob_unpacked; + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt); + return forward(bottom_blob_unpacked, top_blob, opt); + } + + if (elempack == packn) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vf_f32m4(_v, scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vf_f32m4(_v, scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma 
omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vv_f32m4(_v, _scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i 
* packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vv_f32m4(_v, _scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m4_t scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m4(scale_in_data[0], vl) : vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m4(scale_out_data[0], vl) : vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vv_f32m4(_v, scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + + intptr += packn; + ptr += packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m4_t scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m4(scale_in_data[0], vl) : vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t scale_out = scale_out_data_size == 1 ? 
vfmv_v_f_f32m4(scale_out_data[0], vl) : vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + vfloat32m4_t bias = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + i * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, scale_in, bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + + intptr += packn; + ptr += packn; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // if (activation_type == 1) + // { + // requantize_relu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + // return 0; + // } + + // if (activation_type == 2 && activation_params[0] > 0.f) + // { + // requantize_leakyrelu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + // return 0; + // } + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m4_t scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m4(scale_in_data[0], vl) : vle32_v_f32m4((const float*)scale_in_data + q * packn, vl); + vfloat32m4_t scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m4(scale_out_data[0], vl) : vle32_v_f32m4((const float*)scale_out_data + q * packn, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vv_f32m4(_v, scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + + intptr += packn; + ptr += packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m4_t scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m4(scale_in_data[0], vl) : vle32_v_f32m4((const float*)scale_in_data + q * packn, vl); + vfloat32m4_t scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m4(scale_out_data[0], vl) : vle32_v_f32m4((const float*)scale_out_data + q * packn, vl); + vfloat32m4_t bias = bias_data_size == 1 ? 
vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + q * packn, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, scale_in, bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + + intptr += packn; + ptr += packn; + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + signed char* ptr = top_blob; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + 
else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in + bias; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? 
scale_out_data[0] : scale_out_data[q]; + +#if __riscv_vector + int num_nn = size / (packn * 2); + int remain_i_start = num_nn * packn * 2; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + vfloat32m4_t _p1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); + _p0 = activation_ps(vfmul_vf_f32m4(_p0, scale_in, vl), activation_type, activation_params, vl); + _p1 = activation_ps(vfmul_vf_f32m4(_p1, scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _outp0 = float2int8(vfmul_vf_f32m4(_p0, scale_out, vl), vl); + vint8m1_t _outp1 = float2int8(vfmul_vf_f32m4(_p1, scale_out, vl), vl); + vse8_v_i8m1(ptr, _outp0, vl); + vse8_v_i8m1(ptr + packn, _outp1, vl); + ptr += packn * 2; + intptr += packn * 2; + } +#else + int remain_i_start = 0; +#endif + for (int i = remain_i_start; i < size; i++) + { + float v = *intptr * scale_in; + *ptr = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + intptr++; + ptr++; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/requantize_riscv.h b/src/layer/riscv/requantize_riscv.h new file mode 100644 index 000000000000..265c54a6be82 --- /dev/null +++ b/src/layer/riscv/requantize_riscv.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
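+// Requantize_riscv converts int32 blobs back to int8: v = int32 * scale_in (+ bias), activation, then * scale_out,
+// rounded and clamped to [-127, 127]; requantize_riscv.cpp adds RVV-vectorized paths when __riscv_vector is set.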
+ +#ifndef LAYER_REQUANTIZE_RISCV_H +#define LAYER_REQUANTIZE_RISCV_H + +#include "requantize.h" + +namespace ncnn { + +class Requantize_riscv : virtual public Requantize +{ +public: + Requantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_REQUANTIZE_RISCV_H diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index e2824646f871..1e1ba95ae80f 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -50,6 +50,26 @@ static inline int csrr_vlenb() return a; } +static inline int fcsr_frrm() +{ + int a = 0; + asm volatile("frrm %0" + : "=r"(a) + : + : "memory"); + return a; +} + +static inline int fcsr_fsrm(int frm) +{ + int a = 0; + asm volatile("fsrm %0, %1" + : "=r"(a) + : "r"(frm) + : "memory"); + return a; +} + static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; @@ -615,5 +635,36 @@ static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, _r3h = vle32_v_f32m1(ptr + 7 * 4, vl); } #endif +#if NCNN_INT8 +#if __riscv_vector +static inline vint8m1_t float2int8(vfloat32m4_t _v, size_t vl) +{ + int a = fcsr_fsrm(4); + vint16m2_t _vi16 = vfncvt_x_f_w_i16m2(_v, vl); + fcsr_fsrm(a); + _vi16 = vmax_vx_i16m2(_vi16, -127, vl); + _vi16 = vmin_vx_i16m2(_vi16, 127, vl); + return vncvt_x_x_w_i8m1(_vi16, vl); +} + +static inline vint8m2_t float2int8(vfloat32m8_t _v, size_t vl) +{ + int a = fcsr_fsrm(4); + vint16m4_t _vi16 = vfncvt_x_f_w_i16m4(_v, vl); + fcsr_fsrm(a); + _vi16 = vmax_vx_i16m4(_vi16, -127, vl); + _vi16 = vmin_vx_i16m4(_vi16, 127, vl); + return vncvt_x_x_w_i8m2(_vi16, vl); +} +#endif // __riscv_vector + +static inline signed char float2int8(float v) +{ + int int32 = round(v); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return (signed char)int32; +} +#endif // NCNN_INT8 #endif // RISCV_USABILITY_H diff --git a/tests/test_dequantize.cpp b/tests/test_dequantize.cpp index ca05059fa450..9bb76b786856 100644 --- a/tests/test_dequantize.cpp +++ b/tests/test_dequantize.cpp @@ -46,7 +46,11 @@ static int test_dequantize_pack8(const ncnn::Mat& a, int scale_data_size, int bi if (bias_data_size) weights[1] = RandomMat(bias_data_size); +#if NCNN_RVV + int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACKVLENB; +#else int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACK8; +#endif int ret = test_layer("Dequantize", pd, weights, a, 0.001, 0, flag); if (ret != 0) { diff --git a/tests/test_requantize.cpp b/tests/test_requantize.cpp index 1032d529ea67..017b0645a872 100644 --- a/tests/test_requantize.cpp +++ b/tests/test_requantize.cpp @@ -79,7 +79,11 @@ static int test_requantize_pack8(const ncnn::Mat& a, int scale_in_data_size, int Randomize(weights[0], 0.0001, 0.001); Randomize(weights[1], 10, 100); +#if NCNN_RVV + int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACKVLENB; +#else int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACK8; +#endif int ret = test_layer("Requantize", pd, weights, a, 1, 0, flag); if (ret != 0) { diff --git a/tests/testutil.h b/tests/testutil.h index 12f9d0daa654..614b294044f1 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -28,6 +28,10 @@ #define TEST_LAYER_DISABLE_GPU_TESTING (1 << 2) #define TEST_LAYER_ENABLE_FORCE_INPUT_PACK8 (1 << 3) +#if NCNN_RVV +#define TEST_LAYER_ENABLE_FORCE_INPUT_PACKVLENB (1 
<< 4) +#endif // NCNN_RVV + void SRAND(int seed); uint64_t RAND(); From cc2439714791a67a4150141e885308ddd6b5e303 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 9 Feb 2024 15:19:19 +0800 Subject: [PATCH 07/10] riscv dequantize packing Signed-off-by: Molly Sophia --- src/layer/riscv/dequantize_riscv.cpp | 565 +++++++++++++++++++++++++++ src/layer/riscv/dequantize_riscv.h | 32 ++ 2 files changed, 597 insertions(+) create mode 100644 src/layer/riscv/dequantize_riscv.cpp create mode 100644 src/layer/riscv/dequantize_riscv.h diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp new file mode 100644 index 000000000000..9a172a414052 --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.cpp @@ -0,0 +1,565 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dequantize_riscv.h" + +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +Dequantize_riscv::Dequantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif +} + +int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_vector + int packn = csrr_vlenb() / 4; + int in_packn = packn * 4; + size_t vl = vsetvl_e32m4(packn); +#endif + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack != in_packn && elempack != 1) + { + Mat bottom_blob_unpacked; + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt); + return forward(bottom_blob_unpacked, top_blob, opt); + } + + if (elempack == in_packn) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * 4; + + top_blob.create(outw, (size_t)4u * packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmul_vf_f32m4(_v, scale, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vf_f32m4(_v, scale, _bias, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + 
i * packn, vl); + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vf_f32m4(_v, scale, _bias, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmul_vv_f32m4(_v, _scale, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vv_f32m4(_v, _scale, _bias, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vv_f32m4(_v, _scale, _bias, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * 4; + + top_blob.create(w, outh, (size_t)4u * packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 4); + float* ptr1 = top_blob.row(i * 4 + 1); + float* ptr2 = top_blob.row(i * 4 + 2); + float* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m4_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4) * packn, vl); + vfloat32m4_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 1) * packn, vl); + vfloat32m4_t _scale2 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 2) * packn, vl); + vfloat32m4_t _scale3 = scale_data_size == 1 ? 
vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 3) * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _v0 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); + vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); + vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); + _v0 = vfmul_vv_f32m4(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m4(_v1, _scale1, vl); + _v2 = vfmul_vv_f32m4(_v2, _scale2, vl); + _v3 = vfmul_vv_f32m4(_v3, _scale3, vl); + vse32_v_f32m4(ptr0, _v0, vl); + vse32_v_f32m4(ptr1, _v1, vl); + vse32_v_f32m4(ptr2, _v2, vl); + vse32_v_f32m4(ptr3, _v3, vl); + + intptr += in_packn; + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 4); + float* ptr1 = top_blob.row(i * 4 + 1); + float* ptr2 = top_blob.row(i * 4 + 2); + float* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m4_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4) * packn, vl); + vfloat32m4_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 1) * packn, vl); + vfloat32m4_t _scale2 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 2) * packn, vl); + vfloat32m4_t _scale3 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 3) * packn, vl); + vfloat32m4_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (i * 4) * packn, vl); + vfloat32m4_t _bias1 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (i * 4 + 1) * packn, vl); + vfloat32m4_t _bias2 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (i * 4 + 2) * packn, vl); + vfloat32m4_t _bias3 = bias_data_size == 1 ? 
vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (i * 4 + 3) * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _v0 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); + vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); + vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); + _v0 = vfmacc_vv_f32m4(_v0, _scale0, _bias0, vl); + _v1 = vfmacc_vv_f32m4(_v1, _scale1, _bias1, vl); + _v2 = vfmacc_vv_f32m4(_v2, _scale2, _bias2, vl); + _v3 = vfmacc_vv_f32m4(_v3, _scale3, _bias3, vl); + vse32_v_f32m4(ptr0, _v0, vl); + vse32_v_f32m4(ptr1, _v1, vl); + vse32_v_f32m4(ptr2, _v2, vl); + vse32_v_f32m4(ptr3, _v3, vl); + + intptr += in_packn; + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * 4; + + top_blob.create(w, h, outc, (size_t)4u * packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 4); + float* ptr1 = top_blob.channel(q * 4 + 1); + float* ptr2 = top_blob.channel(q * 4 + 2); + float* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m4_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4) * packn, vl); + vfloat32m4_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 1) * packn, vl); + vfloat32m4_t _scale2 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 2) * packn, vl); + vfloat32m4_t _scale3 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 3) * packn, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _v0 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); + vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); + vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); + _v0 = vfmul_vv_f32m4(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m4(_v1, _scale1, vl); + _v2 = vfmul_vv_f32m4(_v2, _scale2, vl); + _v3 = vfmul_vv_f32m4(_v3, _scale3, vl); + vse32_v_f32m4(ptr0, _v0, vl); + vse32_v_f32m4(ptr1, _v1, vl); + vse32_v_f32m4(ptr2, _v2, vl); + vse32_v_f32m4(ptr3, _v3, vl); + + intptr += in_packn; + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 4); + float* ptr1 = top_blob.channel(q * 4 + 1); + float* ptr2 = top_blob.channel(q * 4 + 2); + float* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m4_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4) * packn, vl); + vfloat32m4_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 1) * packn, vl); + vfloat32m4_t _scale2 = scale_data_size == 1 ? 
vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 2) * packn, vl); + vfloat32m4_t _scale3 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 3) * packn, vl); + vfloat32m4_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (q * 4) * packn, vl); + vfloat32m4_t _bias1 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (q * 4 + 1) * packn, vl); + vfloat32m4_t _bias2 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (q * 4 + 2) * packn, vl); + vfloat32m4_t _bias3 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (q * 4 + 3) * packn, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _v0 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); + vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); + vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); + _v0 = vfmacc_vv_f32m4(_v0, _scale0, _bias0, vl); + _v1 = vfmacc_vv_f32m4(_v1, _scale1, _bias1, vl); + _v2 = vfmacc_vv_f32m4(_v2, _scale2, _bias2, vl); + _v3 = vfmacc_vv_f32m4(_v3, _scale3, _bias3, vl); + vse32_v_f32m4(ptr0, _v0, vl); + vse32_v_f32m4(ptr1, _v1, vl); + vse32_v_f32m4(ptr2, _v2, vl); + vse32_v_f32m4(ptr3, _v3, vl); + + intptr += in_packn; + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + float* ptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias_data[i]; + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i]; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[i]; + + int j = 0; +#if __riscv_vector + for (; j + packn < w; j += packn) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmul_vf_f32m4(_v, scale, vl); + vse32_v_f32m4(ptr, _v, vl); + + intptr += packn; + ptr += packn; + } +#endif // __riscv_vector + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + int j = 0; +#if __riscv_vector + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias, vl); + for (; j + packn < w; j += packn) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vf_f32m4(_bias, scale, _v, vl); + vse32_v_f32m4(ptr, _v, vl); + + intptr += packn; + ptr += packn; + } +#endif // __riscv_vector + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + int i = 0; +#if __riscv_vector + for (; i + packn < size; i += packn) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmul_vf_f32m4(_v, scale, vl); + vse32_v_f32m4(ptr, _v, vl); + + intptr += packn; + ptr += packn; + } +#endif // __riscv_vector + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + int i = 0; +#if __riscv_vector + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias, vl); + for (; i + packn < size; i += packn) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vf_f32m4(_bias, scale, _v, vl); + vse32_v_f32m4(ptr, _v, vl); + + intptr += packn; + ptr += packn; + } +#endif // __riscv_vector + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/dequantize_riscv.h b/src/layer/riscv/dequantize_riscv.h new file mode 100644 index 000000000000..f91feb9cb143 --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_RISCV_H +#define LAYER_DEQUANTIZE_RISCV_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_riscv : virtual public Dequantize +{ +public: + Dequantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_RISCV_H From 4936e808e99426d3cbb55b542a1de5246b8b1c5e Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Thu, 15 Feb 2024 18:45:45 +0800 Subject: [PATCH 08/10] fix riscv requantize/dequantize packing Signed-off-by: Molly Sophia --- src/layer/riscv/dequantize_riscv.cpp | 24 ++++++++++++------------ src/layer/riscv/quantize_riscv.cpp | 2 -- src/layer/riscv/requantize_riscv.cpp | 20 ++++++++++---------- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp index 9a172a414052..27662b4f98e1 100644 --- a/src/layer/riscv/dequantize_riscv.cpp +++ b/src/layer/riscv/dequantize_riscv.cpp @@ -83,7 +83,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio float* ptr = (float*)top_blob + i * packn; vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = vfmacc_vf_f32m4(_v, scale, _bias, vl); + _v = vfmacc_vf_f32m4(_bias, scale, _v, vl); vse32_v_f32m4(ptr, _v, vl); } } @@ -97,7 +97,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = vfmacc_vf_f32m4(_v, scale, _bias, vl); + _v = vfmacc_vf_f32m4(_bias, scale, _v, vl); vse32_v_f32m4(ptr, _v, vl); } } @@ -130,7 +130,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = vfmacc_vv_f32m4(_v, _scale, _bias, vl); + _v = vfmacc_vv_f32m4(_bias, _scale, _v, vl); vse32_v_f32m4(ptr, _v, vl); } } @@ -145,7 +145,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = vfmacc_vv_f32m4(_v, _scale, _bias, vl); + _v = vfmacc_vv_f32m4(_bias, _scale, _v, vl); vse32_v_f32m4(ptr, _v, vl); } } @@ -227,10 +227,10 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); - _v0 = vfmacc_vv_f32m4(_v0, _scale0, _bias0, vl); - _v1 = vfmacc_vv_f32m4(_v1, _scale1, _bias1, vl); - _v2 = vfmacc_vv_f32m4(_v2, _scale2, _bias2, vl); - _v3 = vfmacc_vv_f32m4(_v3, _scale3, _bias3, vl); + _v0 = vfmacc_vv_f32m4(_bias0, _scale0, 
_v0, vl); + _v1 = vfmacc_vv_f32m4(_bias1, _scale1, _v1, vl); + _v2 = vfmacc_vv_f32m4(_bias2, _scale2, _v2, vl); + _v3 = vfmacc_vv_f32m4(_bias3, _scale3, _v3, vl); vse32_v_f32m4(ptr0, _v0, vl); vse32_v_f32m4(ptr1, _v1, vl); vse32_v_f32m4(ptr2, _v2, vl); @@ -323,10 +323,10 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); - _v0 = vfmacc_vv_f32m4(_v0, _scale0, _bias0, vl); - _v1 = vfmacc_vv_f32m4(_v1, _scale1, _bias1, vl); - _v2 = vfmacc_vv_f32m4(_v2, _scale2, _bias2, vl); - _v3 = vfmacc_vv_f32m4(_v3, _scale3, _bias3, vl); + _v0 = vfmacc_vv_f32m4(_bias0, _scale0, _v0, vl); + _v1 = vfmacc_vv_f32m4(_bias1, _scale1, _v1, vl); + _v2 = vfmacc_vv_f32m4(_bias2, _scale2, _v2, vl); + _v3 = vfmacc_vv_f32m4(_bias3, _scale3, _v3, vl); vse32_v_f32m4(ptr0, _v0, vl); vse32_v_f32m4(ptr1, _v1, vl); vse32_v_f32m4(ptr2, _v2, vl); diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index 172c7d45ab3d..ed2a331dfd9a 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -231,8 +231,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int size = w * h; int out_elempack = opt.use_packing_layout && channels * elempack % out_packn == 0 ? out_packn : 1; int outc = channels * elempack / out_elempack; - NCNN_LOGE("out_elempack:%d", out_elempack); - NCNN_LOGE("outc:%d", outc); top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); if (top_blob.empty()) diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp index 220087691d54..f2d3db2d1ff7 100644 --- a/src/layer/riscv/requantize_riscv.cpp +++ b/src/layer/riscv/requantize_riscv.cpp @@ -83,7 +83,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio signed char* ptr = (signed char*)top_blob + i * packn; vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -98,7 +98,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -137,7 +137,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, 
_scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -154,7 +154,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -193,7 +193,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -210,7 +210,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -249,7 +249,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -267,7 +267,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -322,7 +322,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio for (int j = 0; j < w; j++) { vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vv_f32m4(_v, scale_in, bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); @@ -394,7 +394,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio for (int i = 0; i < size; i++) { vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = 
activation_ps(vfmacc_vv_f32m4(_v, scale_in, bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); From 5dde273febd36c8894ecdefa0324051b8aab9f89 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sat, 17 Feb 2024 15:02:16 +0800 Subject: [PATCH 09/10] riscv convdw int8 packing rvv Signed-off-by: Molly Sophia --- .../riscv/convolutiondepthwise_riscv.cpp | 388 +++++++++++++++++- src/layer/riscv/convolutiondepthwise_riscv.h | 7 + 2 files changed, 383 insertions(+), 12 deletions(-) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index d913fe7e1d59..d35121dc3c66 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -60,8 +60,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { - // TODO implement int8 - return 0; + return create_pipeline_int8_riscv(opt); } #endif @@ -238,14 +237,11 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { + int packn = 1; +#if __riscv_vector + packn = csrr_vlenb(); +#endif Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; if (bottom_blob_unpacked.elembits() == 16) @@ -253,12 +249,15 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c Option opt_pack1 = opt; opt_pack1.blob_allocator = opt.workspace_allocator; + if (!opt.use_packing_layout || packn == 1 || bottom_blob.elempack == 1) + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + else + convert_packing(bottom_blob, bottom_blob_unpacked, packn / 4, opt_pack1); + cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); } - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return ConvolutionDepthWise::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8_riscv(bottom_blob_unpacked_fp32, top_blob, opt); } #endif @@ -1153,4 +1152,369 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int ConvolutionDepthWise_riscv::create_pipeline_int8_riscv(const Option& opt) +{ +#if __riscv_vector + const int packn = csrr_vlenb(); + size_t vl = vsetvl_e8m1(packn); +#endif + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + elempack = channels % packn == 0 ? 
packn : 1; + } + + if (elempack == packn) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, packn, opt); + } +#endif // __riscv_vector + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + scale_in_data.create(group); + for (int g = 0; g < group; g++) + { + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + scale_in_data[g] = scale_in; + } + + return 0; + } + + // group convolution + Option opt_unpack = opt; + opt_unpack.use_packing_layout = false; + create_group_ops(opt_unpack); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_vector + const int packn = csrr_vlenb(); + size_t vl = vsetvl_e8m1(packn); +#endif + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + + int elembits = bottom_blob.elembits(); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + const int channels_g = channels * elempack / group; + + Mat scales(channels * elempack); + { + float* ps = scales; + for (int g = 0; g < group; g++) + { + float scale = bottom_blob_int8_scales[g]; + for (int q = 0; q < channels_g; q++) + { + *ps++ = scale; + } + } + } + + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + channels = bottom_blob_bordered.c; + elempack = bottom_blob_bordered.elempack; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + // depth-wise + if (channels * elempack == group && group == num_output) + { + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_vector + bool use_int8_requantize = int8_scale_term > 100; + size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_vector + if (elempack == packn) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * packn; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + vint32m4_t _sum = vmv_v_x_i32m4(0, vl); + + const signed char* sptr = m.row(i * stride_h) + j * stride_w * packn; + + for (int k = 0; k < maxk; k++) + { + vint16m2_t _val = vwcvt_x_x_v_i16m2(vle8_v_i8m1(sptr + space_ofs[k] * packn, vl), vl); + vint16m2_t _w = vwcvt_x_x_v_i16m2(vle8_v_i8m1(kptr + k * packn, vl), vl); + + _sum = vwmacc_vv_i32m4(_sum, _val, _w, vl); + } + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + g * packn, vl); + vfloat32m4_t _sumfp32 = bias_term ? vle32_v_f32m4((const float*)bias_data + g * packn, vl) + : vfmv_v_f_f32m4(0.f, vl); + + _sumfp32 = vfmacc_vv_f32m4(_sumfp32, _scale_in, vfcvt_f_x_v_f32m4(_sum, vl), vl); + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + if (use_int8_requantize) + { + // requantize and relu + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)top_blob_int8_scales + g * packn, vl); + _sumfp32 = vfmul_vv_f32m4(_sumfp32, _scale_out, vl); + vint8m1_t _sum8 = float2int8(_sumfp32, vl); + + vse8_v_i8m1(outptr_s8, _sum8, vl); + outptr_s8 += packn; + } + else + { + // dequantize and relu + vse32_v_f32m4(outptr_f32, _sumfp32, vl); + outptr_f32 += packn; + } + } + } + } + } + } +#endif // __riscv_vector + + if (elempack == 1) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + float sumfp32 = sum * scale_in_data[g]; + + if (bias_term) + sumfp32 += bias_data[g]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float scale_out = top_blob_int8_scales[g]; + signed char sums8 = float2int8(sumfp32 * scale_out); + outptr_s8[0] = 
sums8; + outptr_s8 += 1; + } + else + { + // dequantize + outptr_f32[0] = sumfp32; + outptr_f32 += 1; + } + } + } + } + } + } + + return 0; + } + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if 0 + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % packn == 0 ? packn : 1; + else + out_elempack = num_output % (packn / 4) == 0 ? (packn / 4) : 1; + } +#endif // __riscv_vector + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if 0 + if (opt.use_packing_layout) + { + g_elempack = channels_g % packn == 0 ? packn : 1; + if (use_int8_requantize) + out_g_elempack = num_output_g % packn == 0 ? packn : 1; + else + out_g_elempack = num_output_g % (packn / 4) == 0 ? (packn / 4) : 1; + } +#endif // __riscv_vector + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} +#endif // NCNN_INT8 } // namespace ncnn diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h index f9503975296d..944a0ac58727 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.h +++ b/src/layer/riscv/convolutiondepthwise_riscv.h @@ -38,6 +38,10 @@ class ConvolutionDepthWise_riscv : public ConvolutionDepthWise int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8_riscv(const Option& opt); + int forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif public: Layer* activation; @@ -47,6 +51,9 @@ class ConvolutionDepthWise_riscv : public ConvolutionDepthWise // fp16 Mat bias_data_fp16; +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn From 830052acc726e0058f65d5dc1b39db5b52a7d11d Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 1 Mar 2024 15:44:12 +0800 Subject: [PATCH 10/10] fixup! 
riscv requantize packing --- src/layer/riscv/requantize_riscv.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp index f2d3db2d1ff7..833756e248a3 100644 --- a/src/layer/riscv/requantize_riscv.cpp +++ b/src/layer/riscv/requantize_riscv.cpp @@ -83,7 +83,10 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio signed char* ptr = (signed char*)top_blob + i * packn; vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + // _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + _v = vfmul_vf_f32m4(_v, scale_in, vl); + _v = vfadd_vv_f32m4(_v, _bias, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -98,7 +101,10 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + // _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + _v = vfmul_vf_f32m4(_v, scale_in, vl); + _v = vfadd_vv_f32m4(_v, _bias, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -137,7 +143,10 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + // _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + _v = vfmul_vf_f32m4(_v, scale_in, vl); + _v = vfadd_vv_f32m4(_v, _bias, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -154,7 +163,10 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + // _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + _v = vfmul_vf_f32m4(_v, scale_in, vl); + _v = vfadd_vv_f32m4(_v, _bias, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); }