From d0cd52370633224a37b8b6b298ce9e8a8ff4790c Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 8 Dec 2023 13:50:13 +0800 Subject: [PATCH 01/10] riscv int8 convolution Signed-off-by: Molly Sophia --- .../riscv/convolution_3x3_winograd_int8.h | 1620 +++++++++++++++++ .../riscv/convolution_im2col_gemm_int8.h | 953 ++++++++++ src/layer/riscv/convolution_packed_int8.h | 398 ++++ src/layer/riscv/convolution_riscv.cpp | 145 +- src/layer/riscv/convolution_riscv.h | 8 + 5 files changed, 3121 insertions(+), 3 deletions(-) create mode 100644 src/layer/riscv/convolution_3x3_winograd_int8.h create mode 100644 src/layer/riscv/convolution_im2col_gemm_int8.h create mode 100644 src/layer/riscv/convolution_packed_int8.h diff --git a/src/layer/riscv/convolution_3x3_winograd_int8.h b/src/layer/riscv/convolution_3x3_winograd_int8.h new file mode 100644 index 000000000000..9dda146b1338 --- /dev/null +++ b/src/layer/riscv/convolution_3x3_winograd_int8.h @@ -0,0 +1,1620 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void pack_A_tile_int8(const Mat& A, Mat& AT, int batch, int max_ii, int max_kk) +{ + const int N = max_kk * batch; + + for (int b = 0; b < batch; b++) + { + short* pp = AT.row(b); + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + const short* p0 = (const short*)A + ii * N + b; + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[batch]; + pp[2] = p0[N]; + pp[3] = p0[N + batch]; + p0 += batch * 2; + pp += 4; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p0[N]; + p0 += batch; + pp += 2; + } + } + for (; ii < max_ii; ii++) + { + const short* p0 = (const short*)A + ii * N + b; + + int kk = 0; + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + p0 += batch; + pp += 1; + } + } + } +} + +static void transpose_pack_B_tile_int8(const Mat& B, Mat& BT, int batch, int max_jj, int max_kk, int nT) +{ + #pragma omp parallel for num_threads(nT) + for (int b = 0; b < batch; b++) + { + short* pp = BT.row(b); + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + const short* p0 = B; + + int kk = 0; + p0 += (b * max_jj + jj) * 2; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[1]; + pp[2] = p0[2]; + pp[3] = p0[3]; + p0 += max_jj * batch * 2; + pp += 4; + } + p0 -= (b * max_jj + jj) * 2; + p0 += (b * max_jj + jj); + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p0[1]; + p0 += max_jj * batch; + pp += 2; + } + } + for (; jj < max_jj; jj++) + { + const short* p0 = B; + + int kk = 0; + p0 += (b * max_jj + jj) * 2; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[1]; + p0 += max_jj * batch * 2; + pp += 2; + } + p0 -= (b * max_jj + jj) * 2; + p0 += (b * max_jj + jj); + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + p0 += max_jj * batch; + pp += 1; + } + } + } +} + +static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& 
BT_tile, Mat& top_blob, int batch, int max_ii, int max_jj, int k, int max_kk, bool k_end) +{ + int* outptr = top_blob; + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + for (int b = 0; b < batch; b++) + { + const short* pAT = AT_tile.row(b) + max_kk * ii; + const short* pB = BT_tile.row(b); + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + const short* pA = pAT; + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + if (k == 0) + { + sum00 = 0; + sum01 = 0; + sum10 = 0; + sum11 = 0; + } + else + { + sum00 = outptr[0]; + sum01 = outptr[1]; + sum10 = outptr[2]; + sum11 = outptr[3]; + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum00 += pA[0] * pB[0]; + sum00 += pA[1] * pB[1]; + sum01 += pA[2] * pB[0]; + sum01 += pA[3] * pB[1]; + sum10 += pA[0] * pB[2]; + sum10 += pA[1] * pB[3]; + sum11 += pA[2] * pB[2]; + sum11 += pA[3] * pB[3]; + + pA += 4; + pB += 4; + } + for (; kk < max_kk; kk++) + { + sum00 += pA[0] * pB[0]; + sum01 += pA[1] * pB[0]; + sum10 += pA[0] * pB[1]; + sum11 += pA[1] * pB[1]; + pA += 2; + pB += 2; + } + + outptr[0] = sum00; + outptr[1] = sum01; + outptr[2] = sum10; + outptr[3] = sum11; + outptr += 2 * 2; + } + for (; jj < max_jj; jj++) + { + const short* pA = pAT; + + int sum0 = 0; + int sum1 = 0; + + if (k == 0) + { + sum0 = 0; + sum1 = 0; + } + else + { + sum0 = outptr[0]; + sum1 = outptr[1]; + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum0 += pA[0] * pB[0]; + sum0 += pA[1] * pB[1]; + sum1 += pA[2] * pB[0]; + sum1 += pA[3] * pB[1]; + pA += 4; + pB += 2; + } + for (; kk < max_kk; kk++) + { + sum0 += pA[0] * pB[0]; + sum1 += pA[1] * pB[0]; + pA += 2; + pB += 1; + } + + outptr[0] = sum0; + outptr[1] = sum1; + outptr += 2; + } + } + } + for (; ii < max_ii; ii++) + { + for (int b = 0; b < batch; b++) + { + const short* pAT = AT_tile.row(b) + max_kk * ii; + const short* pB = BT_tile.row(b); + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + const short* pA = pAT; + + int sum0 = 0; + int sum1 = 0; + + if (k == 0) + { + sum0 = 0; + sum1 = 0; + } + else + { + sum0 = outptr[0]; + sum1 = outptr[1]; + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum0 += pA[0] * pB[0]; + sum0 += pA[1] * pB[1]; + sum1 += pA[0] * pB[2]; + sum1 += pA[1] * pB[3]; + pA += 2; + pB += 4; + } + for (; kk < max_kk; kk++) + { + sum0 += pA[0] * pB[0]; + sum1 += pA[0] * pB[1]; + pA += 1; + pB += 2; + } + + outptr[0] = sum0; + outptr[1] = sum1; + outptr += 2; + } + for (; jj < max_jj; jj++) + { + const short* pA = pAT; + + int sum = 0; + + if (k == 0) + { + sum = 0; + } + else + { + sum = outptr[0]; + } + + int kk = 0; + for (; kk < max_kk; kk++) + { + sum += pA[0] * pB[0]; + pA += 1; + pB += 1; + } + + outptr[0] = sum; + outptr += 1; + } + } + } +} + +static void get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) +{ + // resolve optimal tile size from cache size + const size_t l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(short)); + + if (nT == 0) + nT = get_physical_big_cpu_count(); + + // solve M + { + int tile_size = (int)sqrt((float)l2_cache_size_int8 / 3); + + TILE_M = std::max(2, tile_size / 2 * 2); + + TILE_M *= std::min(nT, get_physical_cpu_count()); + + int nn_M = (M + TILE_M - 1) / TILE_M; + + TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 1) / 2 * 2); + + if (nT > 1) + { + TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 1) / 2 * 2); + } + } + + // solve K + { + int tile_size = (int)(sqrt((float)l2_cache_size_int8) - TILE_M); + + TILE_K = std::max(2, 
tile_size / 2 * 2); + + int nn_K = (K + TILE_K - 1) / TILE_K; + + TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2); + } + + if (N > 0) + { + int tile_size = (int)((l2_cache_size_int8 - TILE_M * TILE_K) / (TILE_M * 2 + TILE_K)); + + TILE_N = std::max(1, tile_size); + + int nn_N = (N + TILE_N - 1) / TILE_N; + + TILE_N = std::min(TILE_N, (N + nn_N - 1) / nn_N); + } +} + +static inline void conv3x3s1_winograd23_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) +{ + // const signed char ktm[4][3] = { + // {2, 0, 0}, + // {1, 1, 1}, + // {1, -1, 1}, + // {0, 0, 2} + // }; + + short* ptmp = A; + + int ii = 0; + for (; ii < max_ii; ii++) + { + int kk = 0; + for (; kk < max_kk; kk++) + { + short tmp[4][3]; + + const signed char* k0 = (const signed char*)kernel + (i + ii) * inch * 9 + (k + kk) * 9; + + for (int m = 0; m < 3; m++) + { + signed char r0 = k0[0]; + signed char r1 = k0[1]; + signed char r2 = k0[2]; + + tmp[0][m] = r0 * 2; + tmp[1][m] = r0 + r1 + r2; + tmp[2][m] = r0 - r1 + r2; + tmp[3][m] = r2 * 2; + + k0 += 3; + } + + for (int m = 0; m < 4; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + + short z0 = r0 * 2; + short z1 = r0 + r1 + r2; + short z2 = r0 - r1 + r2; + short z3 = r2 * 2; + + ptmp[0] = z0; + ptmp[1] = z1; + ptmp[2] = z2; + ptmp[3] = z3; + ptmp += 4; + } + } + } +} + +static void conv3x3s1_winograd23_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt) +{ + const int M = outch; + const int K = inch; + const int B = 16; + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + + Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, 2u, (Allocator*)0); + + AT.create(TILE_K * TILE_M, B, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 2u, (Allocator*)0); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat A_tile = A_tileX.channel(get_omp_thread_num()); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_ii = std::min((M - i), TILE_M); + const int max_kk = std::min((K - k), TILE_K); + + conv3x3s1_winograd23_transform_kernel_tile_int8(kernel, A_tile, inch, i, max_ii, k, max_kk); + + Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + pack_A_tile_int8(A_tile, AT_tile, B, max_ii, max_kk); + } + } +} + +static inline void conv3x3s1_winograd23_transform_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT) +{ + // const signed char itm[4][4] = { + // {1, 0, -1, 0}, + // {0, 1, 1, 0}, + // {0, -1, 1, 0}, + // {0, -1, 0, 1} + // }; + + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int elempack = bottom_blob.elempack; + const int N = bottom_blob.cstep * elempack; + + const int w_tiles = (w - 1) / 2; + + int nn_max_kk = 0; + int remain_max_kk_start = 0; + nn_max_kk = (max_kk - remain_max_kk_start) / 2; + #pragma omp parallel for num_threads(nT) + for (int ppkk = 0; ppkk < nn_max_kk; ppkk++) + { + const int kk = remain_max_kk_start + ppkk * 2; + + short tmp[4][4][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0 = bottom_blob.channel(k + kk).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 4; m++) + { + signed char r00 = 0; + signed char r01 = 0; + signed char r10 = 0; + signed char r11 = 0; + signed 
char r20 = 0; + signed char r21 = 0; + signed char r30 = 0; + signed char r31 = 0; + + if (ti * 2 + m < h) + { + // if (elempack == 1) + { + const signed char* r1 = r0 + N; + + r00 = r0[0]; + r01 = r1[0]; + if (tj * 2 + 1 < w) + { + r10 = r0[1]; + r11 = r1[1]; + } + if (tj * 2 + 2 < w) + { + r20 = r0[2]; + r21 = r1[2]; + } + if (tj * 2 + 3 < w) + { + r30 = r0[3]; + r31 = r1[3]; + } + } + } + + tmp[0][m][0] = r00 - r20; + tmp[0][m][1] = r01 - r21; + tmp[1][m][0] = r10 + r20; + tmp[1][m][1] = r11 + r21; + tmp[2][m][0] = r20 - r10; + tmp[2][m][1] = r21 - r11; + tmp[3][m][0] = r30 - r10; + tmp[3][m][1] = r31 - r11; + + r0 += w; + } + + short* p0 = (short*)B + kk * max_jj * 16 + jj * 2; + short* p1 = p0 + max_jj * 2; + short* p2 = p0 + max_jj * 2 * 2; + short* p3 = p0 + max_jj * 2 * 3; + + for (int m = 0; m < 4; m++) + { + short r00 = tmp[m][0][0]; + short r01 = tmp[m][0][1]; + short r10 = tmp[m][1][0]; + short r11 = tmp[m][1][1]; + short r20 = tmp[m][2][0]; + short r21 = tmp[m][2][1]; + short r30 = tmp[m][3][0]; + short r31 = tmp[m][3][1]; + + p0[0] = r00 - r20; + p0[1] = r01 - r21; + p1[0] = r10 + r20; + p1[1] = r11 + r21; + p2[0] = r20 - r10; + p2[1] = r21 - r11; + p3[0] = r30 - r10; + p3[1] = r31 - r11; + + p0 += max_jj * 4 * 2; + p1 += max_jj * 4 * 2; + p2 += max_jj * 4 * 2; + p3 += max_jj * 4 * 2; + } + } + } + remain_max_kk_start += nn_max_kk * 2; + for (int kk = remain_max_kk_start; kk < max_kk; kk++) + { + short tmp[4][4]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0123 = bottom_blob.channel(k + kk).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 4; m++) + { + signed char r0 = 0; + signed char r1 = 0; + signed char r2 = 0; + signed char r3 = 0; + + if (ti * 2 + m < h) + { + // if (elempack == 1) + { + r0 = r0123[0]; + if (tj * 2 + 1 < w) r1 = r0123[1]; + if (tj * 2 + 2 < w) r2 = r0123[2]; + if (tj * 2 + 3 < w) r3 = r0123[3]; + } + } + + tmp[0][m] = r0 - r2; + tmp[1][m] = r1 + r2; + tmp[2][m] = r2 - r1; + tmp[3][m] = r3 - r1; + + r0123 += w; + } + + short* p0 = (short*)B + kk * max_jj * 16 + jj; + short* p1 = p0 + max_jj; + short* p2 = p0 + max_jj * 2; + short* p3 = p0 + max_jj * 3; + + for (int m = 0; m < 4; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + short r3 = tmp[m][3]; + + p0[0] = r0 - r2; + p1[0] = r1 + r2; + p2[0] = r2 - r1; + p3[0] = r3 - r1; + + p0 += max_jj * 4; + p1 += max_jj * 4; + p2 += max_jj * 4; + p3 += max_jj * 4; + } + } + } +} + +static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& top_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj) +{ + // const int otm[2][4] = { + // {1, 1, 1, 0}, + // {0, 1, -1, 1} + // }; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int N = top_blob.cstep * out_elempack; + + const int w_tiles = (outw + 1) / 2; + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + int tmp[2][4][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj * 2; + const int* r1 = r0 + max_jj * 2; + const int* r2 = r0 + max_jj * 2 * 2; + const int* r3 = r0 + max_jj * 2 * 3; + + for (int m = 0; m < 4; m++) + { + tmp[0][m][0] = r0[0] + r1[0] + r2[0]; + tmp[0][m][1] = r0[1] + r1[1] + r2[1]; + tmp[1][m][0] = r1[0] - r2[0] + r3[0]; + tmp[1][m][1] = r1[1] - r2[1] + r3[1]; + + r0 += max_jj * 4 * 2; + r1 += max_jj * 4 * 2; 
+ r2 += max_jj * 4 * 2; + r3 += max_jj * 4 * 2; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 2; m++) + { + if (ti * 2 + m >= outh) + continue; + + int tmp00 = tmp[m][0][0] + tmp[m][1][0] + tmp[m][2][0]; + int tmp01 = tmp[m][0][1] + tmp[m][1][1] + tmp[m][2][1]; + int tmp10 = tmp[m][1][0] - tmp[m][2][0] + tmp[m][3][0]; + int tmp11 = tmp[m][1][1] - tmp[m][2][1] + tmp[m][3][1]; + + tmp00 = tmp00 >> 2; + tmp01 = tmp01 >> 2; + tmp10 = tmp10 >> 2; + tmp11 = tmp11 >> 2; + + // if (out_elempack == 1) + { + int* outptr1 = outptr0 + N; + + outptr0[0] = tmp00; + outptr1[0] = tmp01; + if (tj * 2 + 1 < outw) + { + outptr0[1] = tmp10; + outptr1[1] = tmp11; + } + } + + outptr0 += outw; + } + } + } + for (; ii < max_ii; ii++) + { + int tmp[2][4]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj; + const int* r1 = r0 + max_jj; + const int* r2 = r0 + max_jj * 2; + const int* r3 = r0 + max_jj * 3; + + for (int m = 0; m < 4; m++) + { + tmp[0][m] = r0[0] + r1[0] + r2[0]; + tmp[1][m] = r1[0] - r2[0] + r3[0]; + + r0 += max_jj * 4; + r1 += max_jj * 4; + r2 += max_jj * 4; + r3 += max_jj * 4; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 2; m++) + { + if (ti * 2 + m >= outh) + continue; + + int tmp0 = tmp[m][0] + tmp[m][1] + tmp[m][2]; + int tmp1 = tmp[m][1] - tmp[m][2] + tmp[m][3]; + + tmp0 = tmp0 >> 2; + tmp1 = tmp1 >> 2; + + // if (out_elempack == 1) + { + outptr0[0] = tmp0; + if (tj * 2 + 1 < outw) outptr0[1] = tmp1; + } + + outptr0 += outw; + } + } + } +} + +static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) +{ + int outw = top_blob.w; + int outh = top_blob.h; + + // pad to 2n+2, winograd F(2,3) + int w_tiles = (outw + 1) / 2; + int h_tiles = (outh + 1) / 2; + int tiles = w_tiles * h_tiles; + + const int M = top_blob.c * top_blob.elempack; + const int N = tiles; + const int K = bottom_blob.c * bottom_blob.elempack; + const int B = 16; + + // NCNN_LOGE("conv3x3s1_winograd23_int8 %d %d %d", M, N, K); + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + const int nn_N = (N + TILE_N - 1) / TILE_N; + const int nn_K = (K + TILE_K - 1) / TILE_K; + + // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); + + Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + + const int nn_NK = nn_N * nn_K; + + if (nT > 1 && nn_NK < nT) + { + Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); + + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + // transform input + conv3x3s1_winograd23_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, nT); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, nT); + } + } + else + { + Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + 
const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat B_tile = B_tileX.channel(get_omp_thread_num()); + + // transform input + conv3x3s1_winograd23_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, 1); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, 1); + } + } + + Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat top_tile = top_tileX.channel(get_omp_thread_num()); + + const int max_ii = std::min((M - i), TILE_M); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + bool k_end = k + TILE_K >= K; + + gemm_transB_packed_tile_int8(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, k_end); + } + + // transform output + conv3x3s1_winograd23_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj); + } + } +} + +static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) +{ + // const short ktm[6][3] = { + // {6, 0, 0}, + // {-4, -4, -4}, + // {-4, 4, -4}, + // {1, 2, 4}, + // {1, -2, 4}, + // {0, 0, 6} + // }; + + short* ptmp = A; + + int ii = 0; + for (; ii < max_ii; ii++) + { + int kk = 0; + for (; kk < max_kk; kk++) + { + short tmp[6][3]; + + const signed char* k0 = (const signed char*)kernel + (i + ii) * inch * 9 + (k + kk) * 9; + + for (int m = 0; m < 3; m++) + { + signed char r0 = k0[0]; + signed char r1 = k0[1]; + signed char r2 = k0[2]; + + tmp[0][m] = r0 * 6; + tmp[1][m] = -r0 * 4 - r1 * 4 - r2 * 4; + tmp[2][m] = -r0 * 4 + r1 * 4 - r2 * 4; + tmp[3][m] = r0 + r1 * 2 + r2 * 4; + tmp[4][m] = r0 - r1 * 2 + r2 * 4; + tmp[5][m] = r2 * 6; + + k0 += 3; + } + + for (int m = 0; m < 6; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + + short z0 = r0 * 6; + short z1 = -r0 * 4 - r1 * 4 - r2 * 4; + short z2 = -r0 * 4 + r1 * 4 - r2 * 4; + short z3 = r0 + r1 * 2 + r2 * 4; + short z4 = r0 - r1 * 2 + r2 * 4; + short z5 = r2 * 6; + + ptmp[0] = z0; + ptmp[1] = z1; + ptmp[2] = z2; + ptmp[3] = z3; + ptmp[4] = z4; + ptmp[5] = z5; + ptmp += 6; + } + } + } +} + +static void conv3x3s1_winograd43_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt) +{ + const int M = outch; + const int K = inch; + const int B = 36; + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + + Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, 4u, (Allocator*)0); + + AT.create(TILE_K * TILE_M, B, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 4u, (Allocator*)0); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat A_tile = A_tileX.channel(get_omp_thread_num()); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_ii = std::min((M - i), TILE_M); + const int max_kk = std::min((K - k), TILE_K); + + conv3x3s1_winograd43_transform_kernel_tile_int8(kernel, 
A_tile, inch, i, max_ii, k, max_kk); + + Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + pack_A_tile_int8(A_tile, AT_tile, B, max_ii, max_kk); + } + } +} + +static inline void conv3x3s1_winograd43_transform_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT) +{ + // const float itm[4][4] = { + // {4, 0, -5, 0, 1, 0}, + // {0, -4, -4, 1, 1, 0}, + // {0, 4, -4, -1, 1, 0}, + // {0, -2, -1, 2, 1, 0}, + // {0, 2, -1, -2, 1, 0}, + // {0, 4, 0, -5, 0, 1} + // }; + + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int elempack = bottom_blob.elempack; + const int N = bottom_blob.cstep * elempack; + + const int w_tiles = (w + 1) / 4; + + int nn_max_kk = 0; + int remain_max_kk_start = 0; + nn_max_kk = (max_kk - remain_max_kk_start) / 2; + #pragma omp parallel for num_threads(nT) + for (int ppkk = 0; ppkk < nn_max_kk; ppkk++) + { + const int kk = remain_max_kk_start + ppkk * 2; + + short tmp[6][6][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0 = bottom_blob.channel(k + kk).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 6; m++) + { + signed char r00 = 0; + signed char r01 = 0; + signed char r10 = 0; + signed char r11 = 0; + signed char r20 = 0; + signed char r21 = 0; + signed char r30 = 0; + signed char r31 = 0; + signed char r40 = 0; + signed char r41 = 0; + signed char r50 = 0; + signed char r51 = 0; + + if (ti * 4 + m < h) + { + // if (elempack == 1) + { + const signed char* r1 = r0 + N; + + r00 = r0[0]; + r01 = r1[0]; + if (tj * 4 + 1 < w) + { + r10 = r0[1]; + r11 = r1[1]; + } + if (tj * 4 + 2 < w) + { + r20 = r0[2]; + r21 = r1[2]; + } + if (tj * 4 + 3 < w) + { + r30 = r0[3]; + r31 = r1[3]; + } + if (tj * 4 + 4 < w) + { + r40 = r0[4]; + r41 = r1[4]; + } + if (tj * 4 + 5 < w) + { + r50 = r0[5]; + r51 = r1[5]; + } + } + } + + short tmp120a = r30 - r10 * 4; + short tmp121a = r31 - r11 * 4; + short tmp120b = r40 - r20 * 4; + short tmp121b = r41 - r21 * 4; + short tmp340a = (r30 - r10) * 2; + short tmp341a = (r31 - r11) * 2; + short tmp340b = r40 - r20; + short tmp341b = r41 - r21; + + tmp[0][m][0] = r40 + r00 * 4 - r20 * 5; + tmp[0][m][1] = r41 + r01 * 4 - r21 * 5; + tmp[1][m][0] = tmp120b + tmp120a; + tmp[1][m][1] = tmp121b + tmp121a; + tmp[2][m][0] = tmp120b - tmp120a; + tmp[2][m][1] = tmp121b - tmp121a; + tmp[3][m][0] = tmp340b + tmp340a; + tmp[3][m][1] = tmp341b + tmp341a; + tmp[4][m][0] = tmp340b - tmp340a; + tmp[4][m][1] = tmp341b - tmp341a; + tmp[5][m][0] = r50 + r10 * 4 - r30 * 5; + tmp[5][m][1] = r51 + r11 * 4 - r31 * 5; + + r0 += w; + } + + short* p0 = (short*)B + kk * max_jj * 36 + jj * 2; + short* p1 = p0 + max_jj * 2; + short* p2 = p0 + max_jj * 2 * 2; + short* p3 = p0 + max_jj * 2 * 3; + short* p4 = p0 + max_jj * 2 * 4; + short* p5 = p0 + max_jj * 2 * 5; + + for (int m = 0; m < 6; m++) + { + short r00 = tmp[m][0][0]; + short r01 = tmp[m][0][1]; + short r10 = tmp[m][1][0]; + short r11 = tmp[m][1][1]; + short r20 = tmp[m][2][0]; + short r21 = tmp[m][2][1]; + short r30 = tmp[m][3][0]; + short r31 = tmp[m][3][1]; + short r40 = tmp[m][4][0]; + short r41 = tmp[m][4][1]; + short r50 = tmp[m][5][0]; + short r51 = tmp[m][5][1]; + + short tmp120a = r30 - r10 * 4; + short tmp121a = r31 - r11 * 4; + short tmp120b = r40 - r20 * 4; + short tmp121b = r41 - r21 * 4; + short tmp340a = (r30 - r10) * 2; + short tmp341a = (r31 - r11) * 2; + short tmp340b = r40 - r20; + short tmp341b = r41 - r21; + + p0[0] = r40 + r00 * 4 - r20 * 5; + p0[1] = 
r41 + r01 * 4 - r21 * 5; + p1[0] = tmp120b + tmp120a; + p1[1] = tmp121b + tmp121a; + p2[0] = tmp120b - tmp120a; + p2[1] = tmp121b - tmp121a; + p3[0] = tmp340b + tmp340a; + p3[1] = tmp341b + tmp341a; + p4[0] = tmp340b - tmp340a; + p4[1] = tmp341b - tmp341a; + p5[0] = r50 + r10 * 4 - r30 * 5; + p5[1] = r51 + r11 * 4 - r31 * 5; + + p0 += max_jj * 6 * 2; + p1 += max_jj * 6 * 2; + p2 += max_jj * 6 * 2; + p3 += max_jj * 6 * 2; + p4 += max_jj * 6 * 2; + p5 += max_jj * 6 * 2; + } + } + } + remain_max_kk_start += nn_max_kk * 2; + for (int kk = remain_max_kk_start; kk < max_kk; kk++) + { + short tmp[6][6]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0123 = bottom_blob.channel(k + kk).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 6; m++) + { + signed char r0 = 0; + signed char r1 = 0; + signed char r2 = 0; + signed char r3 = 0; + signed char r4 = 0; + signed char r5 = 0; + + if (ti * 4 + m < h) + { + // if (elempack == 1) + { + r0 = r0123[0]; + if (tj * 4 + 1 < w) r1 = r0123[1]; + if (tj * 4 + 2 < w) r2 = r0123[2]; + if (tj * 4 + 3 < w) r3 = r0123[3]; + if (tj * 4 + 4 < w) r4 = r0123[4]; + if (tj * 4 + 5 < w) r5 = r0123[5]; + } + } + + short tmp12a = r3 - r1 * 4; + short tmp12b = r4 - r2 * 4; + short tmp34a = (r3 - r1) * 2; + short tmp34b = r4 - r2; + + tmp[0][m] = r4 + r0 * 4 - r2 * 5; + tmp[1][m] = tmp12b + tmp12a; + tmp[2][m] = tmp12b - tmp12a; + tmp[3][m] = tmp34b + tmp34a; + tmp[4][m] = tmp34b - tmp34a; + tmp[5][m] = r5 + r1 * 4 - r3 * 5; + + r0123 += w; + } + + short* p0 = (short*)B + kk * max_jj * 36 + jj; + short* p1 = p0 + max_jj; + short* p2 = p0 + max_jj * 2; + short* p3 = p0 + max_jj * 3; + short* p4 = p0 + max_jj * 4; + short* p5 = p0 + max_jj * 5; + + for (int m = 0; m < 6; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + short r3 = tmp[m][3]; + short r4 = tmp[m][4]; + short r5 = tmp[m][5]; + + short tmp12a = r3 - r1 * 4; + short tmp12b = r4 - r2 * 4; + short tmp34a = (r3 - r1) * 2; + short tmp34b = r4 - r2; + + p0[0] = r4 + r0 * 4 - r2 * 5; + p1[0] = tmp12b + tmp12a; + p2[0] = tmp12b - tmp12a; + p3[0] = tmp34b + tmp34a; + p4[0] = tmp34b - tmp34a; + p5[0] = r5 + r1 * 4 - r3 * 5; + + p0 += max_jj * 6; + p1 += max_jj * 6; + p2 += max_jj * 6; + p3 += max_jj * 6; + p4 += max_jj * 6; + p5 += max_jj * 6; + } + } + } +} + +static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& top_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj) +{ + // const int otm[4][6] = { + // {1, 1, 1, 1, 1, 0}, + // {0, 1, -1, 2, -2, 0}, + // {0, 1, 1, 4, 4, 0}, + // {0, 1, -1, 8, -8, 1} + // }; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int N = top_blob.cstep * out_elempack; + + const int w_tiles = (outw + 3) / 4; + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + int tmp[4][6][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj * 2; + const int* r1 = r0 + max_jj * 2; + const int* r2 = r0 + max_jj * 2 * 2; + const int* r3 = r0 + max_jj * 2 * 3; + const int* r4 = r0 + max_jj * 2 * 4; + const int* r5 = r0 + max_jj * 2 * 5; + + for (int m = 0; m < 5; m++) + { + int tmp02a0 = r1[0] + r2[0]; + int tmp02a1 = r1[1] + r2[1]; + int tmp02b0 = r3[0] + r4[0]; + int tmp02b1 = r3[1] + r4[1]; + int tmp13a0 = r1[0] - r2[0]; + int tmp13a1 = r1[1] - r2[1]; + int tmp13b0 = 
r3[0] - r4[0]; + int tmp13b1 = r3[1] - r4[1]; + + int tmp00 = tmp02a0 + tmp02b0 + r0[0]; + int tmp01 = tmp02a1 + tmp02b1 + r0[1]; + int tmp10 = tmp13a0 + tmp13b0 * 2; + int tmp11 = tmp13a1 + tmp13b1 * 2; + int tmp20 = tmp02a0 + tmp02b0 * 4; + int tmp21 = tmp02a1 + tmp02b1 * 4; + int tmp30 = tmp13a0 + tmp13b0 * 8 + r5[0] * 4; + int tmp31 = tmp13a1 + tmp13b1 * 8 + r5[1] * 4; + + tmp[0][m][0] = tmp00; + tmp[0][m][1] = tmp01; + tmp[1][m][0] = tmp10; + tmp[1][m][1] = tmp11; + tmp[2][m][0] = tmp20; + tmp[2][m][1] = tmp21; + tmp[3][m][0] = tmp30; + tmp[3][m][1] = tmp31; + + r0 += max_jj * 6 * 2; + r1 += max_jj * 6 * 2; + r2 += max_jj * 6 * 2; + r3 += max_jj * 6 * 2; + r4 += max_jj * 6 * 2; + r5 += max_jj * 6 * 2; + } + for (int m = 5; m < 6; m++) + { + int tmp02a0 = r1[0] + r2[0]; + int tmp02a1 = r1[1] + r2[1]; + int tmp02b0 = r3[0] + r4[0]; + int tmp02b1 = r3[1] + r4[1]; + int tmp13a0 = r1[0] - r2[0]; + int tmp13a1 = r1[1] - r2[1]; + int tmp13b0 = r3[0] - r4[0]; + int tmp13b1 = r3[1] - r4[1]; + + int tmp00 = tmp02a0 + tmp02b0 + r0[0]; + int tmp01 = tmp02a1 + tmp02b1 + r0[1]; + int tmp10 = tmp13a0 + tmp13b0 * 2; + int tmp11 = tmp13a1 + tmp13b1 * 2; + int tmp20 = tmp02a0 + tmp02b0 * 4; + int tmp21 = tmp02a1 + tmp02b1 * 4; + int tmp30 = tmp13a0 + tmp13b0 * 8 + r5[0] * 4; + int tmp31 = tmp13a1 + tmp13b1 * 8 + r5[1] * 4; + + tmp00 = tmp00 * 4; + tmp01 = tmp01 * 4; + tmp10 = tmp10 * 4; + tmp11 = tmp11 * 4; + tmp20 = tmp20 * 4; + tmp21 = tmp21 * 4; + tmp30 = tmp30 * 4; + tmp31 = tmp31 * 4; + + tmp[0][m][0] = tmp00; + tmp[0][m][1] = tmp01; + tmp[1][m][0] = tmp10; + tmp[1][m][1] = tmp11; + tmp[2][m][0] = tmp20; + tmp[2][m][1] = tmp21; + tmp[3][m][0] = tmp30; + tmp[3][m][1] = tmp31; + + r0 += max_jj * 6 * 2; + r1 += max_jj * 6 * 2; + r2 += max_jj * 6 * 2; + r3 += max_jj * 6 * 2; + r4 += max_jj * 6 * 2; + r5 += max_jj * 6 * 2; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 4; m++) + { + if (ti * 4 + m >= outh) + continue; + + int tmp02a0 = tmp[m][1][0] + tmp[m][2][0]; + int tmp02a1 = tmp[m][1][1] + tmp[m][2][1]; + int tmp02b0 = tmp[m][3][0] + tmp[m][4][0]; + int tmp02b1 = tmp[m][3][1] + tmp[m][4][1]; + int tmp13a0 = tmp[m][1][0] - tmp[m][2][0]; + int tmp13a1 = tmp[m][1][1] - tmp[m][2][1]; + int tmp13b0 = tmp[m][3][0] - tmp[m][4][0]; + int tmp13b1 = tmp[m][3][1] - tmp[m][4][1]; + + int tmp00 = tmp02a0 + tmp02b0 + tmp[m][0][0]; + int tmp01 = tmp02a1 + tmp02b1 + tmp[m][0][1]; + int tmp10 = tmp13a0 + tmp13b0 * 2; + int tmp11 = tmp13a1 + tmp13b1 * 2; + int tmp20 = tmp02a0 + tmp02b0 * 4; + int tmp21 = tmp02a1 + tmp02b1 * 4; + int tmp30 = tmp13a0 + tmp13b0 * 8 + tmp[m][5][0]; + int tmp31 = tmp13a1 + tmp13b1 * 8 + tmp[m][5][1]; + + tmp00 = tmp00 / 576; + tmp01 = tmp01 / 576; + tmp10 = tmp10 / 576; + tmp11 = tmp11 / 576; + tmp20 = tmp20 / 576; + tmp21 = tmp21 / 576; + tmp30 = tmp30 / 576; + tmp31 = tmp31 / 576; + + // if (out_elempack == 1) + { + int* outptr1 = outptr0 + N; + + outptr0[0] = tmp00; + outptr1[0] = tmp01; + if (tj * 4 + 1 < outw) + { + outptr0[1] = tmp10; + outptr1[1] = tmp11; + } + if (tj * 4 + 2 < outw) + { + outptr0[2] = tmp20; + outptr1[2] = tmp21; + } + if (tj * 4 + 3 < outw) + { + outptr0[3] = tmp30; + outptr1[3] = tmp31; + } + } + + outptr0 += outw; + } + } + } + for (; ii < max_ii; ii++) + { + int tmp[4][6]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj; + const int* r1 = r0 + max_jj; + const int* r2 = r0 + max_jj * 2; 
+ const int* r3 = r0 + max_jj * 3; + const int* r4 = r0 + max_jj * 4; + const int* r5 = r0 + max_jj * 5; + + for (int m = 0; m < 5; m++) + { + int tmp02a = r1[0] + r2[0]; + int tmp02b = r3[0] + r4[0]; + int tmp13a = r1[0] - r2[0]; + int tmp13b = r3[0] - r4[0]; + + int tmp0 = tmp02a + tmp02b + r0[0]; + int tmp1 = tmp13a + tmp13b * 2; + int tmp2 = tmp02a + tmp02b * 4; + int tmp3 = tmp13a + tmp13b * 8 + r5[0] * 4; + + tmp[0][m] = tmp0; + tmp[1][m] = tmp1; + tmp[2][m] = tmp2; + tmp[3][m] = tmp3; + + r0 += max_jj * 6; + r1 += max_jj * 6; + r2 += max_jj * 6; + r3 += max_jj * 6; + r4 += max_jj * 6; + r5 += max_jj * 6; + } + for (int m = 5; m < 6; m++) + { + int tmp02a = r1[0] + r2[0]; + int tmp02b = r3[0] + r4[0]; + int tmp13a = r1[0] - r2[0]; + int tmp13b = r3[0] - r4[0]; + + int tmp0 = tmp02a + tmp02b + r0[0]; + int tmp1 = tmp13a + tmp13b * 2; + int tmp2 = tmp02a + tmp02b * 4; + int tmp3 = tmp13a + tmp13b * 8 + r5[0] * 4; + + tmp0 = tmp0 * 4; + tmp1 = tmp1 * 4; + tmp2 = tmp2 * 4; + tmp3 = tmp3 * 4; + + tmp[0][m] = tmp0; + tmp[1][m] = tmp1; + tmp[2][m] = tmp2; + tmp[3][m] = tmp3; + + r0 += max_jj * 6; + r1 += max_jj * 6; + r2 += max_jj * 6; + r3 += max_jj * 6; + r4 += max_jj * 6; + r5 += max_jj * 6; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 4; m++) + { + if (ti * 4 + m >= outh) + continue; + + int tmp02a = tmp[m][1] + tmp[m][2]; + int tmp02b = tmp[m][3] + tmp[m][4]; + int tmp13a = tmp[m][1] - tmp[m][2]; + int tmp13b = tmp[m][3] - tmp[m][4]; + + int tmp0 = tmp02a + tmp02b + tmp[m][0]; + int tmp1 = tmp13a + tmp13b * 2; + int tmp2 = tmp02a + tmp02b * 4; + int tmp3 = tmp13a + tmp13b * 8 + tmp[m][5]; + + tmp0 = tmp0 / 576; + tmp1 = tmp1 / 576; + tmp2 = tmp2 / 576; + tmp3 = tmp3 / 576; + + // if (out_elempack == 1) + { + outptr0[0] = tmp0; + if (tj * 4 + 1 < outw) outptr0[1] = tmp1; + if (tj * 4 + 2 < outw) outptr0[2] = tmp2; + if (tj * 4 + 3 < outw) outptr0[3] = tmp3; + } + + outptr0 += outw; + } + } + } +} + +static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) +{ + int outw = top_blob.w; + int outh = top_blob.h; + + // pad to 4n+2, winograd F(4,3) + int w_tiles = (outw + 3) / 4; + int h_tiles = (outh + 3) / 4; + int tiles = w_tiles * h_tiles; + + const int M = top_blob.c * top_blob.elempack; + const int N = tiles; + const int K = bottom_blob.c * bottom_blob.elempack; + const int B = 36; + + // NCNN_LOGE("conv3x3s1_winograd43_int8 %d %d %d", M, N, K); + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + const int nn_N = (N + TILE_N - 1) / TILE_N; + const int nn_K = (K + TILE_K - 1) / TILE_K; + + // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); + + Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + + const int nn_NK = nn_N * nn_K; + + if (nT > 1 && nn_NK < nT) + { + Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); + + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + // transform input + conv3x3s1_winograd43_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, nT); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + 
transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, nT); + } + } + else + { + Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat B_tile = B_tileX.channel(get_omp_thread_num()); + + // transform input + conv3x3s1_winograd43_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, 1); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, 1); + } + } + + Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat top_tile = top_tileX.channel(get_omp_thread_num()); + + const int max_ii = std::min((M - i), TILE_M); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + bool k_end = k + TILE_K >= K; + + gemm_transB_packed_tile_int8(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, k_end); + } + + // transform output + conv3x3s1_winograd43_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj); + } + } +} diff --git a/src/layer/riscv/convolution_im2col_gemm_int8.h b/src/layer/riscv/convolution_im2col_gemm_int8.h new file mode 100644 index 000000000000..4d615e58b4d5 --- /dev/null +++ b/src/layer/riscv/convolution_im2col_gemm_int8.h @@ -0,0 +1,953 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
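+
+// Roughly, the scheme in this file: the convolution is lowered to a single
+// int8 GEMM, C = A * B^T, where
+//   A is the kernel repacked to outch x (inch * maxk)         (M x K)
+//   B is the im2col-ed input, (outw * outh) x (inch * maxk)   (N x K)
+//   C holds the int32 accumulators, outch x (outw * outh)     (M x N)
+// For example, a 3x3 stride-1 convolution with inch = 16, outch = 32 and a
+// 56x56 output gives M = 32, N = 56 * 56 = 3136, K = 16 * 9 = 144 (these
+// sizes are only illustrative). The GEMM is blocked into TILE_M / TILE_N /
+// TILE_K pieces sized from the L2 cache, see
+// convolution_im2col_gemm_get_optimal_tile_mnk_int8 below.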
+ +static void convolution_im2col_pack_A_tile_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) +{ + // A = (pa, maxk, inch/pa), outch + const int A_hstep = A.w; + + signed char* pp = AT; + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + const signed char* p0 = (const signed char*)A + (i + ii) * A_hstep + k; + const signed char* p1 = (const signed char*)A + (i + ii + 1) * A_hstep + k; + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[1]; + pp[2] = p1[0]; + pp[3] = p1[1]; + pp += 4; + p0 += 2; + p1 += 2; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p1[0]; + pp += 2; + p0++; + p1++; + } + } + for (; ii < max_ii; ii += 1) + { + const signed char* p0 = (const signed char*)A + (i + ii) * A_hstep + k; + + int kk = 0; + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp += 1; + p0++; + } + } +} + +static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, Mat& topT_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end) +{ + // NCNN_LOGE("convolution_gemm_transB_packed_tile_int8 %d %d %d %d %d %d", i, max_ii, j, max_jj, k, max_kk); + + const int out_elempack = top_blob.elempack; + const int out_hstep = (int)top_blob.cstep; + + const signed char* pAT = AT_tile; + const signed char* pBT = BT_tile; + + int* outptr = topT_tile; + + int ii = 0; + for (; ii + 1 < max_ii; ii += 2) + { + int* outptr0 = (int*)top_blob + (i + ii) * out_hstep + j; + + const signed char* pB = pBT; + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + int sum00; + int sum10; + int sum01; + int sum11; + + if (k == 0) + { + sum00 = 0; + sum10 = 0; + sum01 = 0; + sum11 = 0; + } + else + { + sum00 = outptr[0]; + sum10 = outptr[1]; + sum01 = outptr[2]; + sum11 = outptr[3]; + } + + const signed char* pA = pAT; + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum00 += pA[0] * pB[0]; + sum00 += pA[1] * pB[1]; + sum10 += pA[2] * pB[0]; + sum10 += pA[3] * pB[1]; + sum01 += pA[0] * pB[2]; + sum01 += pA[1] * pB[3]; + sum11 += pA[2] * pB[2]; + sum11 += pA[3] * pB[3]; + pA += 4; + pB += 4; + } + for (; kk < max_kk; kk += 1) + { + sum00 += pA[0] * pB[0]; + sum10 += pA[1] * pB[0]; + sum01 += pA[0] * pB[1]; + sum11 += pA[1] * pB[1]; + pA += 2; + pB += 2; + } + + if (k_end) + { + // if (out_elempack == 1) + { + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr0[out_hstep] = sum10; + outptr0[out_hstep + 1] = sum11; + outptr0 += 2; + } + } + else + { + outptr[0] = sum00; + outptr[1] = sum10; + outptr[2] = sum01; + outptr[3] = sum11; + } + + outptr += 4; + } + for (; jj < max_jj; jj += 1) + { + int sum0; + int sum1; + + if (k == 0) + { + sum0 = 0; + sum1 = 0; + } + else + { + sum0 = outptr[0]; + sum1 = outptr[1]; + } + + const signed char* pA = pAT; + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum0 += pA[0] * pB[0]; + sum0 += pA[1] * pB[1]; + sum1 += pA[2] * pB[0]; + sum1 += pA[3] * pB[1]; + pA += 4; + pB += 2; + } + for (; kk < max_kk; kk += 1) + { + sum0 += pA[0] * pB[0]; + sum1 += pA[1] * pB[0]; + pA += 2; + pB += 1; + } + + if (k_end) + { + // if (out_elempack == 1) + { + outptr0[0] = sum0; + outptr0[out_hstep] = sum1; + outptr0++; + } + } + else + { + outptr[0] = sum0; + outptr[1] = sum1; + } + + outptr += 2; + } + + pAT += max_kk * 2; + } + for (; ii < max_ii; ii += 1) + { + int* outptr0 = (int*)top_blob + (i + ii) * out_hstep + j; + + const signed char* pB = pBT; + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + int sum0; + int sum1; + + if (k == 0) + { + sum0 = 0; + 
sum1 = 0;
+            }
+            else
+            {
+                sum0 = outptr[0];
+                sum1 = outptr[1];
+            }
+
+            const signed char* pA = pAT;
+            int kk = 0;
+            for (; kk + 1 < max_kk; kk += 2)
+            {
+                sum0 += pA[0] * pB[0];
+                sum0 += pA[1] * pB[1];
+                sum1 += pA[0] * pB[2];
+                sum1 += pA[1] * pB[3];
+                pA += 2;
+                pB += 4;
+            }
+            for (; kk < max_kk; kk += 1)
+            {
+                sum0 += pA[0] * pB[0];
+                sum1 += pA[0] * pB[1];
+                pA += 1;
+                pB += 2;
+            }
+
+            if (k_end)
+            {
+                // if (out_elempack == 1)
+                {
+                    outptr0[0] = sum0;
+                    outptr0[1] = sum1;
+                    outptr0 += 2;
+                }
+            }
+            else
+            {
+                outptr[0] = sum0;
+                outptr[1] = sum1;
+            }
+
+            outptr += 2;
+        }
+        for (; jj < max_jj; jj += 1)
+        {
+            int sum;
+
+            if (k == 0)
+            {
+                sum = 0;
+            }
+            else
+            {
+                sum = outptr[0];
+            }
+
+            const signed char* pA = pAT;
+            int kk = 0;
+            for (; kk < max_kk; kk += 1)
+            {
+                sum += pA[0] * pB[0];
+                pA += 1;
+                pB += 1;
+            }
+
+            if (k_end)
+            {
+                // if (out_elempack == 1)
+                {
+                    outptr0[0] = sum;
+                    outptr0++;
+                }
+            }
+            else
+            {
+                outptr[0] = sum;
+            }
+
+            outptr += 1;
+        }
+
+        pAT += max_kk;
+    }
+}
+
+static void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
+{
+    // resolve optimal tile size from cache size
+    const size_t l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(signed char));
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
+    // solve K
+    {
+        // try not to split K
+        int tile_size = (l2_cache_size_int8 - 2) / 3;
+        TILE_K = std::max(2, tile_size / 2 * 2);
+
+        int nn_K = (K + TILE_K - 1) / TILE_K;
+        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
+    }
+
+    // solve M
+    {
+        int nn_M = (M + 7) / 8;
+        TILE_M = std::max(2, ((M + nn_M - 1) / nn_M + 1) / 2 * 2);
+    }
+
+    {
+        TILE_M *= std::min(nT, get_physical_cpu_count());
+
+        int nn_M = (M + TILE_M - 1) / TILE_M;
+        TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 1) / 2 * 2);
+
+        if (nT > 1)
+        {
+            TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 1) / 2 * 2);
+        }
+    }
+
+    if (N > 0)
+    {
+        int tile_size;
+        if (TILE_K >= K)
+        {
+            tile_size = (l2_cache_size_int8 - TILE_M * TILE_K) / TILE_K;
+        }
+        else
+        {
+            tile_size = (l2_cache_size_int8 - TILE_M * TILE_K) / (TILE_M * 4 + TILE_K);
+        }
+        TILE_N = std::max(1, tile_size);
+
+        int nn_N = (N + TILE_N - 1) / TILE_N;
+        TILE_N = std::min(TILE_N, (N + nn_N - 1) / nn_N);
+    }
+}
+
+static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
+{
+    const int elempack = bottom_blob.elempack;
+
+    signed char* pp = B;
+
+    int jj = 0;
+    for (; jj + 1 < max_jj; jj += 2)
+    {
+        if (elempack == 1)
+        {
+            const signed char* p0 = (const signed char*)bottom_blob.channel(k) + (j + jj);
+
+            int kk = 0;
+            for (; kk + 1 < max_kk; kk += 2)
+            {
+                pp[0] = p0[0];
+                pp[1] = p0[bottom_blob.cstep];
+                pp[2] = p0[1];
+                pp[3] = p0[bottom_blob.cstep + 1];
+                pp += 4;
+                p0 += bottom_blob.cstep * 2;
+            }
+            for (; kk < max_kk; kk++)
+            {
+                pp[0] = p0[0];
+                pp[1] = p0[1];
+                pp += 2;
+                p0 += bottom_blob.cstep;
+            }
+        }
+    }
+    for (; jj < max_jj; jj++)
+    {
+        if (elempack == 1)
+        {
+            const signed char* p0 = (const signed char*)bottom_blob.channel(k) + (j + jj);
+
+            int kk = 0;
+            for (; kk < max_kk; kk++)
+            {
+                pp[0] = p0[0];
+                pp += 1;
+                p0 += bottom_blob.cstep;
+            }
+        }
+    }
+}
+
+template<int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h>
+void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
+{
+    const int w = bottom_blob.w;
+    // const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 
1) + 1; + const int outw = (w - kernel_extent_w) / stride_w + 1; + + // j max_jj outw*outh split w and h + + // k max_kk pa*maxk*(inch/pa) split inch + + // k/max_kk shall be multiple of maxk + + const int maxk = kernel_w * kernel_h; + + signed char* pp = B; + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dx0 = (j + jj) % outw; + int dx1 = (j + jj + 1) % outw; + + if (dy0 == dy1) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp += 2; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp += 2; + } + } + } + } + for (; jj < max_jj; jj++) + { + int dy = (j + jj) / outw; + int dx = (j + jj) % outw; + + int kk = 0; + for (; kk < max_kk / elempack; 
kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x = stride_w * dx + dilation_w * v; + int y = stride_h * dy + dilation_h * u; + + const signed char* sptr = img.row(y) + x * elempack; + + if (elempack == 1) + { + pp[0] = sptr[0]; + pp += 1; + } + } + } +} + +template void convolution_im2col_input_tile_int8<1, 1, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<3, 3, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<3, 3, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<5, 5, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<5, 5, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<7, 7, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); + +static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h) +{ + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_input_tile_conv1x1s1d1_int8(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 1 && kernel_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_input_tile_int8<1, 1, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_input_tile_int8<3, 3, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_input_tile_int8<3, 3, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_input_tile_int8<5, 5, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_input_tile_int8<5, 5, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_input_tile_int8<7, 7, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + const int w = bottom_blob.w; + // const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int outw = (w - kernel_extent_w) / stride_w + 1; + + // j max_jj outw*outh split w and h + + // k max_kk pa*maxk*(inch/pa) split inch + + // k/max_kk shall be multiple of maxk + + const int maxk = kernel_w * kernel_h; + + signed char* pp = B; + + int jj = 0; + for (; jj + 1 < max_jj; jj += 2) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dx0 = (j + jj) % outw; + int 
dx1 = (j + jj + 1) % outw; + + if (dy0 == dy1) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp += 2; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp += 2; + } + } + } + } + for (; jj < max_jj; jj++) + { + int dy = (j + jj) / outw; + int dx = (j + jj) % outw; + + int kk = 0; + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x = stride_w * dx + dilation_w * v; + int y = stride_h * dy + dilation_h * u; + + const signed char* sptr = img.row(y) + x * elempack; + + if (elempack == 1) + { + pp[0] = sptr[0]; + pp += 1; + } + } + } +} + +static void 
convolution_im2col_gemm_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt) +{ + // NCNN_LOGE("convolution_im2col_gemm_transform_kernel"); + const int maxk = kernel_w * kernel_h; + + const int M = outch; + const int K = inch * maxk; + + int TILE_M, TILE_N, TILE_K; + convolution_im2col_gemm_get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + + int elempack = 1; + + // maxk-inch-outch to pa-maxk-inch/pa-outch + Mat A_data; + if (maxk == 1) + { + A_data = kernel.reshape(maxk * inch, outch); + } + else + { + Mat weight_data_r2 = kernel.reshape(maxk, inch, outch); + + A_data.create(maxk * inch, outch, (size_t)1u, 1); + + for (int q = 0; q < outch; q += 1) + { + signed char* g00 = A_data.row(q); + + for (int p = 0; p + (elempack - 1) < inch; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < elempack; i++) + { + const signed char* k00 = weight_data_r2.channel(q).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + + AT.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, (size_t)1u, 1); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + const int max_ii = std::min((M - i), TILE_M); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1); + + convolution_im2col_pack_A_tile_int8(A_data, AT_tile, i, max_ii, k, max_kk); + } + } +} + +static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + + const int M = top_blob.c * top_blob.elempack; + const int N = top_blob.w * top_blob.h; + const int K = bottom_blob.c * bottom_blob.elempack * maxk; + + int TILE_M, TILE_N, TILE_K; + convolution_im2col_gemm_get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + const int nn_N = (N + TILE_N - 1) / TILE_N; + const int nn_K = (K + TILE_K - 1) / TILE_K; + + // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); + + Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 1u, opt.workspace_allocator); + + const int nn_NK = nn_N * nn_K; + + #pragma omp parallel for num_threads(nT) + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); + + // im2col + convolution_im2col_input_tile_int8(bottom_blob, BT_tile, j, max_jj, k, max_kk, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h); + } + + Mat topT_tileX; + if (K > TILE_K) + topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat topT_tile; + if (K > TILE_K) + topT_tile = topT_tileX.channel(get_omp_thread_num()); + + const int max_ii = std::min((M - i), TILE_M); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + for 
(int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + const Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1); + + const Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); + + bool k_end = k + TILE_K >= K; + + convolution_gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, top_blob, i, max_ii, j, max_jj, k, max_kk, k_end); + } + } + } +} diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h new file mode 100644 index 000000000000..a275f05c6c7e --- /dev/null +++ b/src/layer/riscv/convolution_packed_int8.h @@ -0,0 +1,398 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + + // clang-format off + // *INDENT-OFF* + if (outch >= 2) + { + if (inch >= 2) + kernel_tm.create(maxk, inch / 2 + inch % 2, outch / 2 + outch % 2, (size_t)4u, 4); + else + kernel_tm.create(maxk, inch, outch / 2 + outch % 2, (size_t)2u, 2); + } + else + { + if (inch >= 2) + kernel_tm.create(maxk, inch / 2 + inch % 2, outch, (size_t)2u, 2); + else + kernel_tm.create(maxk, inch, outch, (size_t)1u, 1); + } + // *INDENT-ON* + // clang-format on + + int q = 0; + for (; q + 1 < outch; q += 2) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + signed char* g00 = kernel_tm.channel(q / 2); + + int p = 0; + for (; p + 1 < inch; p += 2) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k0[maxk]; + g00[3] = k1[maxk]; + g00 += 4; + } + + kptr0 += maxk * 2; + kptr1 += maxk * 2; + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00 += 2; + } + } + } + for (; q < outch; q++) + { + const signed char* kptr = (const signed char*)kernel + q * inch * maxk; + signed char* g00 = kernel_tm.channel(q / 2 + q % 2); + + int p = 0; + for (; p + 1 < inch; p += 2) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + + g00[0] = k0[0]; + g00[1] = k0[maxk]; + g00 += 2; + } + + kptr += maxk * 2; + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + + g00[0] = k0[0]; + g00++; + } + } + } +} + +static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + const 
int w = bottom_blob.w; + const int elempack = bottom_blob.elempack; + const int inch = bottom_blob.c * elempack; + + const int N = bottom_blob.cstep * elempack; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int outch = top_blob.c * out_elempack; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2 * elempack; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = (outch - remain_outch_start) / 2; + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 2; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int i1 = (ij + 1) / outw; + const int j0 = ij % outw; + const int j1 = (ij + 1) % outw; + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + const signed char* kptr = weight_data_tm.channel(p / 2); + + int q = 0; + for (; q + 1 < inch; q += 2) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + sum00 += r0s[0] * kptr[0]; + sum10 += r0s[0] * kptr[1]; + sum00 += r0s[N] * kptr[2]; + sum10 += r0s[N] * kptr[3]; + sum01 += r1s[0] * kptr[0]; + sum11 += r1s[0] * kptr[1]; + sum01 += r1s[N] * kptr[2]; + sum11 += r1s[N] * kptr[3]; + + kptr += 4; + } + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + sum00 += r0s[0] * kptr[0]; + sum10 += r0s[0] * kptr[1]; + sum01 += r1s[0] * kptr[0]; + sum11 += r1s[0] * kptr[1]; + + kptr += 2; + } + } + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr0 += 2; + outptr1 += 2; + } + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum0 = 0; + int sum1 = 0; + + const signed char* kptr = weight_data_tm.channel(p / 2); + + int q = 0; + for (; q + 1 < inch; q += 2) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum1 += r0s[0] * kptr[1]; + sum0 += r0s[N] * kptr[2]; + sum1 += r0s[N] * kptr[3]; + + kptr += 4; + } + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if 
(elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum1 += r0s[0] * kptr[1]; + + kptr += 2; + } + } + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr0 += 1; + outptr1 += 1; + } + } + remain_outch_start += nn_outch * 2; + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int i1 = (ij + 1) / outw; + const int j0 = ij % outw; + const int j1 = (ij + 1) % outw; + + int sum0 = 0; + int sum1 = 0; + + const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); + + int q = 0; + for (; q + 1 < inch; q += 2) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum0 += r0s[N] * kptr[1]; + sum1 += r1s[0] * kptr[0]; + sum1 += r1s[N] * kptr[1]; + + kptr += 2; + } + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum1 += r1s[0] * kptr[0]; + + kptr += 1; + } + } + } + + outptr[0] = sum0; + outptr[1] = sum1; + outptr += 2; + } + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum = 0; + + const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); + + int q = 0; + for (; q + 1 < inch; q += 2) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum += r0s[0] * kptr[0]; + sum += r0s[N] * kptr[1]; + + kptr += 2; + } + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum += r0s[0] * kptr[0]; + + kptr += 1; + } + } + } + + outptr[0] = sum; + outptr += 1; + } + } +} diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index be413e5be252..b6e470ceae8a 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -35,6 +35,13 @@ namespace ncnn { #include "convolution_1x1.h" #include "convolution_3x3.h" +#if NCNN_INT8 +#include "convolution_packed_int8.h" +#include "convolution_im2col_gemm_int8.h" + +#include "convolution_3x3_winograd_int8.h" +#endif // NCNN_INT8 + #if __riscv_vector #include "convolution_packn.h" #include "convolution_pack1ton.h" @@ -133,8 +140,7 @@ int Convolution_riscv::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { - // TODO implement int8 - return 0; + return create_pipeline_int8_riscv(opt); } #endif @@ -279,7 +285,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti Option opt_unpacked = opt; opt_unpacked.use_packing_layout = false; - return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return 
forward_int8_riscv(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); } #endif @@ -1102,4 +1108,137 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int Convolution_riscv::create_pipeline_int8_riscv(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8); + + if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if (opt.use_winograd43_convolution) + conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt); + else + conv3x3s1_winograd23_transform_kernel_int8(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); + } + else + { + convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + // NCNN_LOGE("Convolution_riscv input %dx%d ksize=%dx%d stride=%dx%d", + // bottom_blob.w, bottom_blob.h, kernel_w, kernel_h, stride_w, stride_h); + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = bottom_blob_bordered.h; + int channels = bottom_blob_bordered.c; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; + size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + + // NCNN_LOGE("forward_int8_riscv %dx%dx%d elempack=%d out_elempack=%d int8_scale_term=%d", + // w, h, bottom_blob_bordered.c, elempack, out_elempack, int8_scale_term); + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + + int out_elempack_int32 = 1; + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8); + + int _nT = opt.num_threads; + + if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if (opt.use_winograd43_convolution && !weight_winograd43_data.empty()) + conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); + else + conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + } + else + { + convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + + // if (use_int8_requantize) + // { + // requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + // } + // else + // { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + // } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h index a4e008c9dd1d..3aa5bcc72587 100644 --- a/src/layer/riscv/convolution_riscv.h +++ b/src/layer/riscv/convolution_riscv.h @@ -37,17 +37,25 @@ class Convolution_riscv : public Convolution int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8_riscv(const Option& opt); + int forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif public: Layer* activation; Mat weight_data_tm; + Mat weight_sgemm_data; Mat weight_winograd23_data; Mat weight_winograd43_data; Mat weight_winograd63_data; // fp16 Mat bias_data_fp16; +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn From 7cb209189a3a7cb41ed72a6fbd52e3045723e5ad Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sat, 9 Dec 2023 12:26:33 +0800 Subject: [PATCH 02/10] riscv int8 innerproduct Signed-off-by: Molly Sophia --- src/layer/riscv/innerproduct_riscv.cpp | 178 ++++++++++++++++++++++++- src/layer/riscv/innerproduct_riscv.h | 9 ++ 2 files changed, 184 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index accfc683584f..c0d22817710a 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -52,8 
+52,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { - // TODO implement int8 - return 0; + return create_pipeline_int8_riscv(opt); } #endif @@ -148,7 +147,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt Option opt_unpacked = opt; opt_unpacked.use_packing_layout = false; - return InnerProduct::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8_riscv(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); } #endif @@ -1090,4 +1089,177 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int InnerProduct_riscv::create_pipeline_int8_riscv(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g0 = weight_data_tm.row(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input) + { + // gemm + Mat bottom_blob_int8_unpacked; + Option opt_unpack = opt; + opt_unpack.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + + int h = bottom_blob_int8_unpacked.h; + + int out_elempack = 1; + + int outh = h / out_elempack; + + top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; + + if (num_output_elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + int sum = 0; + + int i = 0; + for (; i < num_input; i++) + { + sum += *m++ * *kptr++; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + outptr[0] = activation_ss(sumfp32, activation_type, activation_params); + outptr += 1; + } + } + } + + return 0; + } + + Mat bottom_blob_int8_flattened = bottom_blob_int8; + if (bottom_blob_int8.dims != 1) + { + Option opt_flatten = opt; 
+ opt_flatten.blob_allocator = opt.workspace_allocator; + flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + } + + // int elempack = bottom_blob_int8_flattened.elempack; + + int out_elempack = 1; + // size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + int sum = 0; + + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int i = 0; + for (; i < num_input; i++) + { + signed char val = sptr[0]; + + signed char w = kptr[0]; + + sum += val * w; + + sptr += 1; + kptr += 1; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + top_blob[p] = sumfp32; + } + } + + return 0; +} +#endif // NCNN_INT8 } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index d3056d5801d0..9b44bf8a3cca 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -36,6 +36,11 @@ class InnerProduct_riscv : public InnerProduct int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8_riscv(const Option& opt); + int forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* flatten; @@ -43,6 +48,10 @@ class InnerProduct_riscv : public InnerProduct // fp16 Mat bias_data_fp16; + +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn From bd2fd74e5ffbd7039dd0e98a1bc2b3e9f91d6dc9 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 22 Dec 2023 08:25:35 +0800 Subject: [PATCH 03/10] rvv conv int8 Signed-off-by: Molly Sophia --- src/layer/riscv/convolution_1x1_int8.h | 26 + src/layer/riscv/convolution_riscv.cpp | 48 +- src/layer/riscv/convolution_sgemm_int8.h | 628 +++++++++++++++ .../riscv/convolution_sgemm_packnto1_int8.h | 720 ++++++++++++++++++ 4 files changed, 1400 insertions(+), 22 deletions(-) create mode 100644 src/layer/riscv/convolution_1x1_int8.h create mode 100644 src/layer/riscv/convolution_sgemm_int8.h create mode 100644 src/layer/riscv/convolution_sgemm_packnto1_int8.h diff --git a/src/layer/riscv/convolution_1x1_int8.h b/src/layer/riscv/convolution_1x1_int8.h new file mode 100644 index 000000000000..6d0f546d25c1 --- /dev/null +++ b/src/layer/riscv/convolution_1x1_int8.h @@ -0,0 +1,26 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
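+
+// With a 1x1 kernel and stride 1 every output pixel reads exactly one input sample per
+// channel, so the whole layer collapses into a single GEMM:
+//
+//   top[oc][i] = sum over ic of weight[oc][ic] * bottom[ic][i],   i in [0, w*h)
+//
+// The helper below therefore only relabels the blob (the w*h pixels become the gemm N
+// dimension) and forwards to im2col_sgemm_int8_rvv; no data is copied. Rough scalar
+// pseudo-code of the same sum, assuming elempack == 1 and an int32 top blob
+// (illustrative sketch only, not the optimized path):
+//
+//   for (int oc = 0; oc < outch; oc++)
+//       for (int i = 0; i < w * h; i++)
+//       {
+//           int sum = 0;
+//           for (int ic = 0; ic < inch; ic++)
+//               sum += bottom[ic][i] * weight[oc * inch + ic];
+//           top[oc][i] = sum;
+//       }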
+ +static void conv1x1s1_sgemm_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_int8_rvv(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index b6e470ceae8a..99f71f0f0bab 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -37,9 +37,10 @@ namespace ncnn { #if NCNN_INT8 #include "convolution_packed_int8.h" -#include "convolution_im2col_gemm_int8.h" #include "convolution_3x3_winograd_int8.h" +#include "convolution_sgemm_int8.h" +#include "convolution_1x1_int8.h" #endif // NCNN_INT8 #if __riscv_vector @@ -1116,7 +1117,11 @@ int Convolution_riscv::create_pipeline_int8_riscv(const Option& opt) bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8); - if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_int8_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { if (opt.use_winograd43_convolution) conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt); @@ -1125,7 +1130,8 @@ int Convolution_riscv::create_pipeline_int8_riscv(const Option& opt) } else if (opt.use_sgemm_convolution) { - convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); + convolution_im2col_sgemm_transform_kernel_int8_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + // convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); } else { @@ -1165,9 +1171,6 @@ int Convolution_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); } - // NCNN_LOGE("Convolution_riscv input %dx%d ksize=%dx%d stride=%dx%d", - // bottom_blob.w, bottom_blob.h, kernel_w, kernel_h, stride_w, stride_h); - Mat bottom_blob_bordered; make_padding(bottom_blob_int8, bottom_blob_bordered, opt); if (bottom_blob_bordered.empty()) @@ -1188,9 +1191,6 @@ int Convolution_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, int out_elempack = 1; size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; - // NCNN_LOGE("forward_int8_riscv %dx%dx%d elempack=%d out_elempack=%d int8_scale_term=%d", - // w, h, bottom_blob_bordered.c, elempack, out_elempack, int8_scale_term); - top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -1207,7 +1207,11 @@ int Convolution_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, int _nT = opt.num_threads; - if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + } + else if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { if (opt.use_winograd43_convolution && !weight_winograd43_data.empty()) conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); @@ -1216,26 +1220,26 @@ int Convolution_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, } else if (opt.use_sgemm_convolution) { - convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + convolution_im2col_sgemm_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); } else { convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); } - // if (use_int8_requantize) - // { - // requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); - // } - // else - // { - dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); - - if (activation) + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else { - activation->forward_inplace(top_blob, opt); + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } } - // } return 0; } diff --git a/src/layer/riscv/convolution_sgemm_int8.h b/src/layer/riscv/convolution_sgemm_int8.h new file mode 100644 index 000000000000..276be4e92ca7 --- /dev/null +++ b/src/layer/riscv/convolution_sgemm_int8.h @@ -0,0 +1,628 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
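+
+// The kernels in this file produce raw int32 accumulators, sum = Σ (int8 activation × int8 weight),
+// one per output channel and output pixel. Turning those back into real values happens in the
+// caller (convolution_riscv.cpp), per output channel p, roughly as
+//
+//   scale_in[p] = 1.f / (bottom_blob_int8_scale * weight_data_int8_scale[p]);
+//   float out   = sum * scale_in[p] + bias[p];        // dequantize_from_int32
+//   out         = activation(out);
+//   // and only when int8_scale_term > 100, a further requantize back to int8:
+//   // out_q = saturate<int8>(round(out * top_blob_int8_scale));
+//
+// (sketch of the epilogue under those assumptions; the exact rounding and saturation live in the
+// generic requantize/dequantize layers, not in this header)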
+ +static void im2col_sgemm_int8_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ +#if __riscv_vector + int packn = csrr_vlenb(); + size_t vl = vsetvl_e8m1(packn); +#else + int packn = 4; + size_t vl = 4; +#endif + + // Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= packn) + tmp.create(packn * maxk, inch, size / packn + size % packn, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + { + int nn_size = size / packn; + int remain_size_start = nn_size * packn; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = ii * packn; + + int8_t* tmpptr = tmp.channel(i / packn); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { +#if __riscv_vector + vse8_v_i8m1(tmpptr, vle8_v_i8m1(img0, vl), vl); +#else + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + tmpptr[2] = img0[2]; + tmpptr[3] = img0[3]; +#endif + img0 += size; + tmpptr += packn; + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int8_t* tmpptr = tmp.channel(i / packn + i % packn); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + img0 += size; + tmpptr += 1; + } + } + } + } + + +#if __riscv_vector + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + int* outptr4 = top_blob.channel(p + 4); + int* outptr5 = top_blob.channel(p + 5); + int* outptr6 = top_blob.channel(p + 6); + int* outptr7 = top_blob.channel(p + 7); + + int i = 0; + for (; i + (packn - 1) < size; i += packn) + { + const int8_t* tmpptr = tmp.channel(i / packn); + const int8_t* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + vint32m4_t _sum0_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum1_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum2_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum3_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum4_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum5_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum6_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum7_32 = vmv_v_x_i32m4(0, vl); + + for (int q = 0; q < nn; q++) + { + vint16m2_t _val = vwcvt_x_x_v_i16m2(vle8_v_i8m1(tmpptr, vl), vl); + _sum0_32 = vwmacc_vx_i32m4(_sum0_32, kptr[0], _val, vl); + _sum1_32 = vwmacc_vx_i32m4(_sum1_32, kptr[1], _val, vl); + _sum2_32 = vwmacc_vx_i32m4(_sum2_32, kptr[2], _val, vl); + _sum3_32 = vwmacc_vx_i32m4(_sum3_32, kptr[3], _val, vl); + _sum4_32 = vwmacc_vx_i32m4(_sum4_32, kptr[4], _val, vl); + _sum5_32 = vwmacc_vx_i32m4(_sum5_32, kptr[5], _val, vl); + _sum6_32 = vwmacc_vx_i32m4(_sum6_32, kptr[6], _val, vl); + _sum7_32 = vwmacc_vx_i32m4(_sum7_32, kptr[7], _val, vl); + tmpptr += packn; + kptr += 8; + } + + vse32_v_i32m4(outptr0, _sum0_32, vl); + vse32_v_i32m4(outptr1, _sum1_32, vl); + vse32_v_i32m4(outptr2, _sum2_32, vl); + 
vse32_v_i32m4(outptr3, _sum3_32, vl); + vse32_v_i32m4(outptr4, _sum4_32, vl); + vse32_v_i32m4(outptr5, _sum5_32, vl); + vse32_v_i32m4(outptr6, _sum6_32, vl); + vse32_v_i32m4(outptr7, _sum7_32, vl); + + outptr0 += packn; + outptr1 += packn; + outptr2 += packn; + outptr3 += packn; + outptr4 += packn; + outptr5 += packn; + outptr6 += packn; + outptr7 += packn; + } + for (; i < size; i++) + { + const int8_t* tmpptr = tmp.channel(i / packn + i % packn); + const int8_t* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + int sum4 = 0; + int sum5 = 0; + int sum6 = 0; + int sum7 = 0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + sum4 += tmpptr[0] * kptr[4]; + sum5 += tmpptr[0] * kptr[5]; + sum6 += tmpptr[0] * kptr[6]; + sum7 += tmpptr[0] * kptr[7]; + tmpptr++; + kptr += 8; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + outptr4[0] = sum4; + outptr5[0] = sum5; + outptr6[0] = sum6; + outptr7[0] = sum7; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + outptr4++; + outptr5++; + outptr6++; + outptr7++; + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + (packn - 1) < size; i += packn) + { + const int8_t* tmpptr = tmp.channel(i / packn); + const int8_t* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + vint32m4_t _sum0_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum1_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum2_32 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum3_32 = vmv_v_x_i32m4(0, vl); + + for (int q = 0; q < nn; q++) + { + vint16m2_t _val = vwcvt_x_x_v_i16m2(vle8_v_i8m1(tmpptr, vl), vl); + _sum0_32 = vwmacc_vx_i32m4(_sum0_32, kptr[0], _val, vl); + _sum1_32 = vwmacc_vx_i32m4(_sum1_32, kptr[1], _val, vl); + _sum2_32 = vwmacc_vx_i32m4(_sum2_32, kptr[2], _val, vl); + _sum3_32 = vwmacc_vx_i32m4(_sum3_32, kptr[3], _val, vl); + + tmpptr += packn; + kptr += 4; + } + + vse32_v_i32m4(outptr0, _sum0_32, vl); + vse32_v_i32m4(outptr1, _sum1_32, vl); + vse32_v_i32m4(outptr2, _sum2_32, vl); + vse32_v_i32m4(outptr3, _sum3_32, vl); + + outptr0 += packn; + outptr1 += packn; + outptr2 += packn; + outptr3 += packn; + } + for (; i < size; i++) + { + const int8_t* tmpptr = tmp.channel(i / packn + i % packn); + const int8_t* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + tmpptr++; + kptr += 4; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + } + + remain_outch_start += nn_outch << 2; +#else + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = 
top_blob.channel(p + 1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + const int8_t* tmpptr = tmp.channel(i / 4); + const int8_t* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + int sum00 = 0; + int sum01 = 0; + int sum02 = 0; + int sum03 = 0; + int sum10 = 0; + int sum11 = 0; + int sum12 = 0; + int sum13 = 0; + + for (int q = 0; q < nn; q++) + { + int8_t k0 = kptr[0]; + int8_t k1 = kptr[1]; + sum00 += tmpptr[0] * k0; + sum01 += tmpptr[1] * k0; + sum02 += tmpptr[2] * k0; + sum03 += tmpptr[3] * k0; + sum10 += tmpptr[0] * k1; + sum11 += tmpptr[1] * k1; + sum12 += tmpptr[2] * k1; + sum13 += tmpptr[3] * k1; + tmpptr += 4; + kptr += 2; + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr0[2] = sum02; + outptr0[3] = sum03; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr1[2] = sum12; + outptr1[3] = sum13; + + outptr0 += 4; + outptr1 += 4; + } + for (; i < size; i++) + { + const int8_t* tmpptr = tmp.channel(i / 4 + i % 4); + const int8_t* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + int sum0 = 0; + int sum1 = 0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + tmpptr++; + kptr += 2; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + + outptr0++; + outptr1++; + } + } +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + (packn - 1) < size; i += packn) + { + const int8_t* tmpptr = tmp.channel(i / packn); +#if __riscv_vector + const int8_t* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const int8_t* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + +#if __riscv_vector + vint32m4_t _sum0_32 = vmv_v_x_i32m4(0, vl); +#else + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; +#endif + + for (int q = 0; q < nn; q++) + { +#if __riscv_vector + vint16m2_t _val = vwcvt_x_x_v_i16m2(vle8_v_i8m1(tmpptr, vl), vl); + _sum0_32 = vwmacc_vx_i32m4(_sum0_32, kptr[0], _val, vl); +#else + int8_t k0 = kptr[0]; + sum0 += tmpptr[0] * k0; + sum1 += tmpptr[1] * k0; + sum2 += tmpptr[2] * k0; + sum3 += tmpptr[3] * k0; +#endif + tmpptr += packn; + kptr++; + } + +#if __riscv_vector + vse32_v_i32m4(outptr0, _sum0_32, vl); +#else + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0[2] = sum2; + outptr0[3] = sum3; +#endif + outptr0 += packn; + } + for (; i < size; i++) + { + const int8_t* tmpptr = tmp.channel(i / packn + i % packn); +#if __riscv_vector + const int8_t* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const int8_t* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + + int sum0 = 0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + tmpptr++; + kptr++; + } + + outptr0[0] = sum0; + + outptr0++; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_int8_rvv(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8b-maxk-inch-outch/8b + Mat kernel = _kernel.reshape(maxk, inch, outch); +#if __riscv_vector + kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + outch % 4, (size_t)1u); +#else + kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)1u); +#endif + int q = 0; +#if __riscv_vector + for (; q + 7 < outch; q += 8) + { + const Mat k0 = kernel.channel(q); + 
const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + const Mat k4 = kernel.channel(q + 4); + const Mat k5 = kernel.channel(q + 5); + const Mat k6 = kernel.channel(q + 6); + const Mat k7 = kernel.channel(q + 7); + + int8_t* g00 = kernel_tm.channel(q / 8); + + for (int p = 0; p < inch; p++) + { + const int8_t* k00 = (const int8_t*)k0.row(p); + const int8_t* k10 = (const int8_t*)k1.row(p); + const int8_t* k20 = (const int8_t*)k2.row(p); + const int8_t* k30 = (const int8_t*)k3.row(p); + const int8_t* k40 = (const int8_t*)k4.row(p); + const int8_t* k50 = (const int8_t*)k5.row(p); + const int8_t* k60 = (const int8_t*)k6.row(p); + const int8_t* k70 = (const int8_t*)k7.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = (int8_t)k00[k]; + g00[1] = (int8_t)k10[k]; + g00[2] = (int8_t)k20[k]; + g00[3] = (int8_t)k30[k]; + g00[4] = (int8_t)k40[k]; + g00[5] = (int8_t)k50[k]; + g00[6] = (int8_t)k60[k]; + g00[7] = (int8_t)k70[k]; + + g00 += 8; + } + } + } + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + + int8_t* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4); + + for (int p = 0; p < inch; p++) + { + const int8_t* k00 = (const int8_t*)k0.row(p); + const int8_t* k10 = (const int8_t*)k1.row(p); + const int8_t* k20 = (const int8_t*)k2.row(p); + const int8_t* k30 = (const int8_t*)k3.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = (int8_t)k00[k]; + g00[1] = (int8_t)k10[k]; + g00[2] = (int8_t)k20[k]; + g00[3] = (int8_t)k30[k]; + + g00 += 4; + } + } + } +#else + for (; q + 1 < outch; q += 2) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + + int8_t* g00 = kernel_tm.channel(q / 2); + + for (int p = 0; p < inch; p++) + { + const int8_t* k00 = (const int8_t*)k0.row(p); + const int8_t* k10 = (const int8_t*)k1.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + + g00 += 2; + } + } + } +#endif + for (; q < outch; q++) + { + const Mat k0 = kernel.channel(q); + +#if __riscv_vector + int8_t* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + q % 4); +#else + int8_t* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + for (int p = 0; p < inch; p++) + { + const int8_t* k00 = (const int8_t*)k0.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = (int8_t)k00[k]; + + g00 += 1; + } + } + } +} + +static void convolution_im2col_sgemm_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int8_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int8_t* sptr = (const int8_t*)img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + 
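+    // bottom_im2col now holds, per input channel p, maxk rows of length size = outw*outh:
+    // row (u * kernel_w + v) carries the input sample seen at kernel position (u, v) for every
+    // output pixel, i.e. output pixel (oy, ox) reads
+    //   bottom_blob.channel(p).row(oy * stride_h + u * dilation_h)[ox * stride_w + v * dilation_w]
+    // The matrix product of this against the interleaved kernel is done by the sgemm call below.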
im2col_sgemm_int8_rvv(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/riscv/convolution_sgemm_packnto1_int8.h b/src/layer/riscv/convolution_sgemm_packnto1_int8.h new file mode 100644 index 000000000000..d25b968c702a --- /dev/null +++ b/src/layer/riscv/convolution_sgemm_packnto1_int8.h @@ -0,0 +1,720 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_packnto1_int8_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + const int packn = csrr_vlenb(); + const size_t vl = vsetvl_e8m1(packn); + + // Mat bottom_im2col(size, maxk, inch, 1u * packn, packn, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + Mat tmp; + if (size >= 8) + tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + (size % 4) / 2 + size % 2, 1u * packn, packn, opt.workspace_allocator); + else if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + (size % 4) / 2 + size % 2, 1u * packn, packn, opt.workspace_allocator); + else if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u * packn, packn, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u * packn, packn, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size >> 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + + int8_t* tmpptr = tmp.channel(i / 8); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i * packn; + + for (int k = 0; k < maxk; k++) + { +#if C906 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr[2] = img0[l + packn * 2]; + tmpptr[3] = img0[l + packn * 3]; + tmpptr[4] = img0[l + packn * 4]; + tmpptr[5] = img0[l + packn * 5]; + tmpptr[6] = img0[l + packn * 6]; + tmpptr[7] = img0[l + packn * 7]; + tmpptr += 8; + } + + img0 += size * packn; +#else + vint8m1_t _val0 = vle8_v_i8m1(img0, vl); + vint8m1_t _val1 = vle8_v_i8m1(img0 + packn, vl); + vint8m1_t _val2 = vle8_v_i8m1(img0 + packn * 2, vl); + vint8m1_t _val3 = vle8_v_i8m1(img0 + packn * 3, vl); + vint8m1_t _val4 = vle8_v_i8m1(img0 + packn * 4, vl); + vint8m1_t _val5 = vle8_v_i8m1(img0 + packn * 5, vl); + vint8m1_t _val6 = vle8_v_i8m1(img0 + packn * 6, vl); + vint8m1_t _val7 = vle8_v_i8m1(img0 + packn * 7, vl); + vsseg8e8_v_i8m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); + + img0 += size * packn; + tmpptr += packn * 8; +#endif + } + } + } + + remain_size_start += nn_size << 3; + nn_size = (size - remain_size_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + + int8_t* tmpptr = 
tmp.channel(i / 8 + (i % 8) / 4); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i * packn; + + for (int k = 0; k < maxk; k++) + { +#if C906 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr[2] = img0[l + packn * 2]; + tmpptr[3] = img0[l + packn * 3]; + tmpptr += 4; + } + + img0 += size * packn; +#else + vint8m1_t _val0 = vle8_v_i8m1(img0, vl); + vint8m1_t _val1 = vle8_v_i8m1(img0 + packn, vl); + vint8m1_t _val2 = vle8_v_i8m1(img0 + packn * 2, vl); + vint8m1_t _val3 = vle8_v_i8m1(img0 + packn * 3, vl); + vsseg4e8_v_i8m1(tmpptr, _val0, _val1, _val2, _val3, vl); + + img0 += size * packn; + tmpptr += packn * 4; +#endif + } + } + } + + remain_size_start += nn_size << 2; + nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i * packn; + + for (int k = 0; k < maxk; k++) + { +#if C906 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr += 2; + } + + img0 += size * packn; +#else + vint8m1_t _val0 = vle8_v_i8m1(img0, vl); + vint8m1_t _val1 = vle8_v_i8m1(img0 + packn, vl); + vsseg2e8_v_i8m1(tmpptr, _val0, _val1, vl); + + img0 += size * packn; + tmpptr += packn * 2; +#endif + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int8_t* img0 = (const int8_t*)bottom_im2col.channel(q) + i * packn; + + for (int k = 0; k < maxk; k++) + { + vint8m1_t _val = vle8_v_i8m1(img0, vl); + vse8_v_i8m1(tmpptr, _val, vl); + + img0 += size * packn; + tmpptr += packn; + } + } + } + } + + // TODO + int nn_outch = outch / packn; + int remain_outch_start = nn_outch * packn; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * packn; + + int8_t* outptr0 = top_blob.channel(p); + +#ifdef __clang__ + const int8_t* zeros = _zero_tmp; +#else + const int8_t zeros[packn] = {0}; +#endif // __clang__ + const int8_t* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 7 < size; i += 8) + { + const int8_t* tmpptr = tmp.channel(i / 8); + const int8_t* kptr0 = kernel.channel(p / packn); + + int nn = inch * maxk * packn; // inch always > 0 + + vint8m1_t _sum0 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum1 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum2 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum3 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum4 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum5 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum6 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum7 = vle8_v_i8m1(biasptr, vl); + + for (int j = 0; j < nn; j++) + { + int8_t val0 = *tmpptr++; + int8_t val1 = *tmpptr++; + int8_t val2 = *tmpptr++; + int8_t val3 = *tmpptr++; + int8_t val4 = *tmpptr++; + int8_t val5 = *tmpptr++; + int8_t val6 = *tmpptr++; + int8_t val7 = *tmpptr++; + vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl); + _sum0 = vmacc_vx_i8m1(_sum0, val0, _w0, vl); + _sum1 = vmacc_vx_i8m1(_sum1, val1, _w0, vl); + _sum2 = vmacc_vx_i8m1(_sum2, val2, _w0, vl); + _sum3 = vmacc_vx_i8m1(_sum3, val3, _w0, vl); + _sum4 = vmacc_vx_i8m1(_sum4, val4, _w0, vl); + _sum5 = vmacc_vx_i8m1(_sum5, val5, _w0, vl); + _sum6 = vmacc_vx_i8m1(_sum6, val6, _w0, vl); + _sum7 = vmacc_vx_i8m1(_sum7, val7, _w0, vl); + + kptr0 += packn; + } + +#if C906 + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, vl); + vsse8_v_i8m1(outptr0 + 1, top_blob.cstep * sizeof(int8_t), _sum1, vl); + vsse8_v_i8m1(outptr0 + 2, top_blob.cstep * sizeof(int8_t), _sum2, vl); + vsse8_v_i8m1(outptr0 + 3, top_blob.cstep * sizeof(int8_t), _sum3, vl); + vsse8_v_i8m1(outptr0 + 4, top_blob.cstep * sizeof(int8_t), _sum4, vl); + vsse8_v_i8m1(outptr0 + 5, top_blob.cstep * sizeof(int8_t), _sum5, vl); + vsse8_v_i8m1(outptr0 + 6, top_blob.cstep * sizeof(int8_t), _sum6, vl); + vsse8_v_i8m1(outptr0 + 7, top_blob.cstep * sizeof(int8_t), _sum7, vl); +#else + vssseg8e8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); +#endif + outptr0 += 8; + } + for (; i + 3 < size; i += 4) + { + const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4); + const int8_t* kptr0 = kernel.channel(p / packn); + + int nn = inch * maxk * packn; // inch always > 0 + + vint8m1_t _sum0 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum1 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum2 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum3 = vle8_v_i8m1(biasptr, vl); + + for (int j = 0; j < nn; j++) + { + int8_t val0 = *tmpptr++; + int8_t val1 = *tmpptr++; + int8_t val2 = *tmpptr++; + int8_t val3 = *tmpptr++; + vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl); + _sum0 = vmacc_vx_i8m1(_sum0, val0, _w0, vl); + _sum1 = vmacc_vx_i8m1(_sum1, val1, _w0, vl); + _sum2 = vmacc_vx_i8m1(_sum2, val2, _w0, vl); + _sum3 = vmacc_vx_i8m1(_sum3, val3, _w0, vl); + + kptr0 += packn; + } + +#if C906 + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, vl); + vsse8_v_i8m1(outptr0 + 1, top_blob.cstep * sizeof(int8_t), _sum1, vl); + vsse8_v_i8m1(outptr0 + 2, top_blob.cstep * sizeof(int8_t), _sum2, vl); + vsse8_v_i8m1(outptr0 + 3, top_blob.cstep * sizeof(int8_t), _sum3, vl); +#else + vssseg4e8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, _sum1, _sum2, _sum3, vl); +#endif + outptr0 += 4; + } + for (; i + 1 < size; i += 2) + { + const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2); + const int8_t* kptr0 = kernel.channel(p / packn); + + int nn = inch * maxk * packn; // inch always > 0 + + vint8m1_t _sum0 = vle8_v_i8m1(biasptr, vl); + vint8m1_t _sum1 = vle8_v_i8m1(biasptr, vl); + + 
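+            // Each lane of _sum0/_sum1 is one of the packn output channels p..p+packn-1; every j
+            // step loads their packn weights into _w0 and broadcasts one activation per pixel,
+            // i.e. per lane c:  sum0[c] += val0 * w[c];  sum1[c] += val1 * w[c];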
for (int j = 0; j < nn; j++) + { + int8_t val0 = *tmpptr++; + int8_t val1 = *tmpptr++; + vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl); + _sum0 = vmacc_vx_i8m1(_sum0, val0, _w0, vl); + _sum1 = vmacc_vx_i8m1(_sum1, val1, _w0, vl); + + kptr0 += packn; + } + +#if C906 + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, vl); + vsse8_v_i8m1(outptr0 + 1, top_blob.cstep * sizeof(int8_t), _sum1, vl); +#else + vssseg2e8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum0, _sum1, vl); +#endif + outptr0 += 2; + } + for (; i < size; i++) + { + const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2); + const int8_t* kptr0 = kernel.channel(p / packn); + + int nn = inch * maxk * packn; // inch always > 0 + + vint8m1_t _sum = vle8_v_i8m1(biasptr, vl); + + for (int j = 0; j < nn; j++) + { + int8_t val = *tmpptr++; + vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl); + _sum = vmacc_vx_i8m1(_sum, val, _w0, vl); + + kptr0 += packn; + } + + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), _sum, vl); + + outptr0 += 1; + } + } +#ifdef __clang__ + delete[] _zero_tmp; +#endif // __clang__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int8_t* outptr0 = top_blob.channel(p); + + const int8_t bias0 = bias ? bias[p] : 0; + + int i = 0; + for (; i + 7 < size; i += 8) + { + const int8_t* tmpptr = tmp.channel(i / 8); + const int8_t* kptr0 = kernel.channel(p / packn + p % packn); + + int nn = inch * maxk; // inch always > 0 + + int8_t sum0 = bias0; + int8_t sum1 = bias0; + int8_t sum2 = bias0; + int8_t sum3 = bias0; + int8_t sum4 = bias0; + int8_t sum5 = bias0; + int8_t sum6 = bias0; + int8_t sum7 = bias0; + + vint8m1_t _sum0 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum1 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum2 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum3 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum4 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum5 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum6 = vmv_v_x_i8m1(0, vl); + vint8m1_t _sum7 = vmv_v_x_i8m1(0, vl); + + for (int j = 0; j < nn; j++) + { + vint8m1_t _val0; + vint8m1_t _val1; + vint8m1_t _val2; + vint8m1_t _val3; + vint8m1_t _val4; + vint8m1_t _val5; + vint8m1_t _val6; + vint8m1_t _val7; + vlseg8e8_v_i8m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); + vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl); + _sum0 = vmacc_vv_i8m1(_sum0, _val0, _w0, vl); + _sum1 = vmacc_vv_i8m1(_sum1, _val1, _w0, vl); + _sum2 = vmacc_vv_i8m1(_sum2, _val2, _w0, vl); + _sum3 = vmacc_vv_i8m1(_sum3, _val3, _w0, vl); + _sum4 = vmacc_vv_i8m1(_sum4, _val4, _w0, vl); + _sum5 = vmacc_vv_i8m1(_sum5, _val5, _w0, vl); + _sum6 = vmacc_vv_i8m1(_sum6, _val6, _w0, vl); + _sum7 = vmacc_vv_i8m1(_sum7, _val7, _w0, vl); + tmpptr += packn * 8; + kptr0 += packn; + } + +#if C906 + // TODO + std::vector ss0(packn); + std::vector ss1(packn); + std::vector ss2(packn); + std::vector ss3(packn); + std::vector ss4(packn); + std::vector ss5(packn); + std::vector ss6(packn); + std::vector ss7(packn); + vse8_v_i8m1((int8_t*)ss0.data(), _sum0, vl); + vse8_v_i8m1((int8_t*)ss1.data(), _sum1, vl); + vse8_v_i8m1((int8_t*)ss2.data(), _sum2, vl); + vse8_v_i8m1((int8_t*)ss3.data(), _sum3, vl); + vse8_v_i8m1((int8_t*)ss4.data(), _sum4, vl); + vse8_v_i8m1((int8_t*)ss5.data(), _sum5, vl); + vse8_v_i8m1((int8_t*)ss6.data(), _sum6, vl); + vse8_v_i8m1((int8_t*)ss7.data(), _sum7, vl); + for (int i = 0; i < packn; i++) + { + sum0 += ss0[i]; + sum1 += ss1[i]; + sum2 += ss2[i]; + sum3 += ss3[i]; + sum4 += ss4[i]; + sum5 += ss5[i]; + sum6 += ss6[i]; + 
+                sum7 += ss7[i];
+            }
+#else
+            sum0 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum0, vmv_s_x_i8m1(vint8m1_t(), sum0, vl), vl));
+            sum1 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum1, vmv_s_x_i8m1(vint8m1_t(), sum1, vl), vl));
+            sum2 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum2, vmv_s_x_i8m1(vint8m1_t(), sum2, vl), vl));
+            sum3 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum3, vmv_s_x_i8m1(vint8m1_t(), sum3, vl), vl));
+            sum4 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum4, vmv_s_x_i8m1(vint8m1_t(), sum4, vl), vl));
+            sum5 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum5, vmv_s_x_i8m1(vint8m1_t(), sum5, vl), vl));
+            sum6 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum6, vmv_s_x_i8m1(vint8m1_t(), sum6, vl), vl));
+            sum7 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum7, vmv_s_x_i8m1(vint8m1_t(), sum7, vl), vl));
+#endif
+
+            outptr0[0] = sum0;
+            outptr0[1] = sum1;
+            outptr0[2] = sum2;
+            outptr0[3] = sum3;
+            outptr0[4] = sum4;
+            outptr0[5] = sum5;
+            outptr0[6] = sum6;
+            outptr0[7] = sum7;
+
+            outptr0 += 8;
+        }
+        for (; i + 3 < size; i += 4)
+        {
+            const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4);
+            const int8_t* kptr0 = kernel.channel(p / packn + p % packn);
+
+            int nn = inch * maxk; // inch always > 0
+
+            int8_t sum0 = bias0;
+            int8_t sum1 = bias0;
+            int8_t sum2 = bias0;
+            int8_t sum3 = bias0;
+
+            vint8m1_t _sum0 = vmv_v_x_i8m1(0, vl);
+            vint8m1_t _sum1 = vmv_v_x_i8m1(0, vl);
+            vint8m1_t _sum2 = vmv_v_x_i8m1(0, vl);
+            vint8m1_t _sum3 = vmv_v_x_i8m1(0, vl);
+
+            for (int j = 0; j < nn; j++)
+            {
+                vint8m1_t _val0;
+                vint8m1_t _val1;
+                vint8m1_t _val2;
+                vint8m1_t _val3;
+
+                vlseg4e8_v_i8m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl);
+                vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl);
+                _sum0 = vmacc_vv_i8m1(_sum0, _val0, _w0, vl);
+                _sum1 = vmacc_vv_i8m1(_sum1, _val1, _w0, vl);
+                _sum2 = vmacc_vv_i8m1(_sum2, _val2, _w0, vl);
+                _sum3 = vmacc_vv_i8m1(_sum3, _val3, _w0, vl);
+                tmpptr += packn * 4;
+                kptr0 += packn;
+            }
+
+#if C906
+            // TODO
+            std::vector<int8_t> ss0(packn);
+            std::vector<int8_t> ss1(packn);
+            std::vector<int8_t> ss2(packn);
+            std::vector<int8_t> ss3(packn);
+            vse8_v_i8m1((int8_t*)ss0.data(), _sum0, vl);
+            vse8_v_i8m1((int8_t*)ss1.data(), _sum1, vl);
+            vse8_v_i8m1((int8_t*)ss2.data(), _sum2, vl);
+            vse8_v_i8m1((int8_t*)ss3.data(), _sum3, vl);
+            for (int i = 0; i < packn; i++)
+            {
+                sum0 += ss0[i];
+                sum1 += ss1[i];
+                sum2 += ss2[i];
+                sum3 += ss3[i];
+            }
+#else
+            sum0 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum0, vmv_s_x_i8m1(vint8m1_t(), sum0, vl), vl));
+            sum1 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum1, vmv_s_x_i8m1(vint8m1_t(), sum1, vl), vl));
+            sum2 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum2, vmv_s_x_i8m1(vint8m1_t(), sum2, vl), vl));
+            sum3 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum3, vmv_s_x_i8m1(vint8m1_t(), sum3, vl), vl));
+#endif
+
+            outptr0[0] = sum0;
+            outptr0[1] = sum1;
+            outptr0[2] = sum2;
+            outptr0[3] = sum3;
+
+            outptr0 += 4;
+        }
+        for (; i + 1 < size; i += 2)
+        {
+            const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2);
+            const int8_t* kptr0 = kernel.channel(p / packn + p % packn);
+
+            int nn = inch * maxk; // inch always > 0
+
+            int8_t sum0 = bias0;
+            int8_t sum1 = bias0;
+
+            vint8m1_t _sum0 = vmv_v_x_i8m1(0, vl);
+            vint8m1_t _sum1 = vmv_v_x_i8m1(0, vl);
+
+            for (int j = 0; j < nn; j++)
+            {
+                vint8m1_t _val0;
+                vint8m1_t _val1;
+                vlseg2e8_v_i8m1(&_val0, &_val1, tmpptr, vl);
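+                // one packn-wide weight column per iteration, lane-wise multiply-accumulate for both pixels; lanes are reduced to scalars after the loop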
                vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl);
+                _sum0 = vmacc_vv_i8m1(_sum0, _val0, _w0, vl);
+                _sum1 = vmacc_vv_i8m1(_sum1, _val1, _w0, vl);
+                tmpptr += packn * 2;
+                kptr0 += packn;
+            }
+
+#if C906
+            // TODO
+            std::vector<int8_t> ss0(packn);
+            std::vector<int8_t> ss1(packn);
+            vse8_v_i8m1((int8_t*)ss0.data(), _sum0, vl);
+            vse8_v_i8m1((int8_t*)ss1.data(), _sum1, vl);
+            for (int i = 0; i < packn; i++)
+            {
+                sum0 += ss0[i];
+                sum1 += ss1[i];
+            }
+#else
+            sum0 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum0, vmv_s_x_i8m1(vint8m1_t(), sum0, vl), vl));
+            sum1 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum1, vmv_s_x_i8m1(vint8m1_t(), sum1, vl), vl));
+#endif
+
+            outptr0[0] = sum0;
+            outptr0[1] = sum1;
+
+            outptr0 += 2;
+        }
+        for (; i < size; i++)
+        {
+            const int8_t* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
+            const int8_t* kptr0 = kernel.channel(p / packn + p % packn);
+
+            int nn = inch * maxk; // inch always > 0
+
+            int8_t sum0 = bias0;
+
+            vint8m1_t _sum0 = vmv_v_x_i8m1(0, vl);
+
+            for (int j = 0; j < nn; j++)
+            {
+                vint8m1_t _val0 = vle8_v_i8m1(tmpptr, vl);
+                vint8m1_t _w0 = vle8_v_i8m1(kptr0, vl);
+                _sum0 = vmacc_vv_i8m1(_sum0, _val0, _w0, vl);
+                tmpptr += packn;
+                kptr0 += packn;
+            }
+
+#if C906
+            // TODO
+            std::vector<int8_t> ss0(packn);
+            vse8_v_i8m1((int8_t*)ss0.data(), _sum0, vl);
+            for (int i = 0; i < packn; i++)
+            {
+                sum0 += ss0[i];
+            }
+#else
+            sum0 = vmv_x_s_i8m1_i8(vredsum_vs_i8m1_i8m1(vint8m1_t(), _sum0, vmv_s_x_i8m1(vint8m1_t(), sum0, vl), vl));
+#endif
+
+            outptr0[0] = sum0;
+
+            outptr0 += 1;
+        }
+    }
+}
+
+static void convolution_im2col_sgemm_transform_kernel_packnto1_int8_rvv(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
+{
+    const int packn = csrr_vlenb();
+
+    const int maxk = kernel_w * kernel_h;
+
+    // interleave
+    // src = maxk-inch-outch
+    // dst = pb-pa-maxk-inch/pa-outch/pb
+    Mat kernel = _kernel.reshape(maxk, inch, outch);
+    kernel_tm.create(packn * packn * maxk, inch / packn, outch / packn + outch % packn, (size_t)1u);
+
+    int q = 0;
+    for (; q + (packn - 1) < outch; q += packn)
+    {
+        int8_t* g00 = kernel_tm.channel(q / packn);
+
+        for (int p = 0; p + (packn - 1) < inch; p += packn)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int i = 0; i < packn; i++)
+                {
+                    for (int j = 0; j < packn; j++)
+                    {
+                        const signed char* k00 = kernel.channel(q + j).row<const signed char>(p + i);
+
+                        g00[0] = k00[k];
+
+                        g00++;
+                    }
+                }
+            }
+        }
+    }
+    for (; q < outch; q++)
+    {
+        const Mat k0 = kernel.channel(q);
+
+        int8_t* g00 = kernel_tm.channel(q / packn + q % packn);
+
+        for (int p = 0; p + (packn - 1) < inch; p += packn)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int j = 0; j < packn; j++)
+                {
+                    const signed char* k00 = k0.row<const signed char>(p + j);
+
+                    g00[0] = k00[k];
+
+                    g00++;
+                }
+            }
+        }
+    }
+}
+
+static void convolution_im2col_sgemm_packnto1_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    const int packn = csrr_vlenb();
+    const size_t vl = vsetvl_e8m1(packn);
+
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    const int size = outw * outh;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // im2col
+    Mat bottom_im2col(size, maxk, inch, 1u * packn, packn, opt.workspace_allocator);
+    {
+        const int gap = (w * stride_h - outw * stride_w) * packn;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < inch; p++)
+        {
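+            // im2col: for each kernel tap (u,v), gather the packn-packed pixels of this input channel into a contiguous row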
+ const Mat img = bottom_blob.channel(p); + int8_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int8_t* sptr = img.row(dilation_h * u) + dilation_w * v * packn; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + vint8m1_t _val = vle8_v_i8m1(sptr, vl); + vse8_v_i8m1(ptr, _val, vl); + + sptr += stride_w * packn; + ptr += packn; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_packnto1_int8_rvv(bottom_im2col, top_blob, kernel, _bias, opt); +} From 57b961a529a26051f57d30400ebf3259419e938f Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 25 Dec 2023 16:22:00 +0800 Subject: [PATCH 04/10] innerproduct int8 riscv rvv Signed-off-by: Molly Sophia --- src/layer/riscv/innerproduct_riscv.cpp | 154 +++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 9 deletions(-) diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index c0d22817710a..d3a8c15cf978 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -1140,6 +1140,9 @@ int InnerProduct_riscv::create_pipeline_int8_riscv(const Option& opt) int InnerProduct_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if __riscv_vector + const int packn = csrr_vlenb(); +#endif const int num_input = weight_data_size / num_output; int elembits = bottom_blob.elembits(); @@ -1178,8 +1181,46 @@ int InnerProduct_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob for (int j = 0; j < outh; j++) { float* outptr = top_blob.row(j); +#if __riscv_vector + int nn_num_output = num_output / packn ? num_output / packn - 1 : 0; + int remain_num_output_start = nn_num_output * packn; + + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * packn; - for (int p = 0; p < num_output; p++) + size_t vl = vsetvl_e8m1(packn); + vint32m4_t _sum = vmv_v_x_i32m4(0, vl); + + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + for (int i = 0; i < num_input; i++) + { + vint16m2_t _kptr = vwcvt_x_x_v_i16m2(vlse8_v_i8m1(kptr, num_input, vl), vl); + _sum = vwmacc_vx_i32m4(_sum, *m, _kptr, vl); + m++; + kptr++; + } + + vfloat32m4_t _sumfp32; + if (bias_term) + _sumfp32 = vle32_v_f32m4((const float *)bias_data + p, vl); + else + _sumfp32 = vfmv_v_f_f32m4(0.f, vl); + + _sumfp32 = vfmacc_vv_f32m4(_sumfp32, vreinterpret_v_i32m4_f32m4(_sum), + vle32_v_f32m4((const float *)scale_in_data + p, vl), vl); + + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + vse32_v_f32m4((float*)outptr, _sumfp32, vl); + } +#else + int remain_num_output_start = 0; +#endif + + for (int p = remain_num_output_start; p < num_output; p++) { const signed char* kptr = weight_data_tm.row(p); const signed char* m = bottom_blob_int8_unpacked.row(j); @@ -1226,22 +1267,117 @@ int InnerProduct_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob if (out_elempack == 1) { +#if __riscv_vector + int nn_num_output = num_output / packn ? 
num_output / packn - 1 : 0; + int remain_num_output_start = nn_num_output * packn; + #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * packn; + + size_t vl = vsetvl_e8m1(packn); + vint32m4_t _sum = vmv_v_x_i32m4(0, vl); + + const signed char* w = weight_data_tm.row(p); + + const signed char* m = bottom_blob_int8_flattened; + + int n = num_input; + while (n > 0) + { + vint16m2_t _w = vwcvt_x_x_v_i16m2(vlse8_v_i8m1(w, num_input, vl), vl); + _sum = vwmacc_vx_i32m4(_sum, *m, _w, vl); + + m += 1; + w += 1; + n -= 1; + } + + vfloat32m4_t sumfp32; + if (bias_term) + sumfp32 = vle32_v_f32m4((const float *)bias_data + p, vl); + else + sumfp32 = vfmv_v_f_f32m4(0.f, vl); + + sumfp32 = vfmacc_vv_f32m4(sumfp32, vreinterpret_v_i32m4_f32m4(_sum), + vle32_v_f32m4((const float *)scale_in_data + p, vl), vl); + + sumfp32 = activation_ps(sumfp32, activation_type, activation_params, vl); + + vse32_v_f32m4((float*)top_blob + p, sumfp32, vl); + } +#else + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + const signed char* w0 = weight_data_tm.row(p); + const signed char* w1 = weight_data_tm.row(p + 1); + const signed char* w2 = weight_data_tm.row(p + 2); + const signed char* w3 = weight_data_tm.row(p + 3); + + const signed char* m = bottom_blob_int8_flattened; + + for (int i = 0; i < num_input; i++) + { + sum0 += *m * *w0; + sum1 += *m * *w1; + sum2 += *m * *w2; + sum3 += *m * *w3; + + m++; + w0++; + w1++; + w2++; + w3++; + } + + float sumfp32_0 = sum0 * scale_in_data[p]; + float sumfp32_1 = sum1 * scale_in_data[p + 1]; + float sumfp32_2 = sum2 * scale_in_data[p + 2]; + float sumfp32_3 = sum3 * scale_in_data[p + 3]; + + if (bias_term) + { + sumfp32_0 += bias_data[p]; + sumfp32_1 += bias_data[p + 1]; + sumfp32_2 += bias_data[p + 2]; + sumfp32_3 += bias_data[p + 3]; + } + + sumfp32_0 = activation_ss(sumfp32_0, activation_type, activation_params); + sumfp32_1 = activation_ss(sumfp32_1, activation_type, activation_params); + sumfp32_2 = activation_ss(sumfp32_2, activation_type, activation_params); + sumfp32_3 = activation_ss(sumfp32_3, activation_type, activation_params); + + top_blob[p] = sumfp32_0; + top_blob[p + 1] = sumfp32_1; + top_blob[p + 2] = sumfp32_2; + top_blob[p + 3] = sumfp32_3; + } +#endif // __riscv_vector + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) { int sum = 0; const signed char* kptr = weight_data_tm.row(p); const signed char* sptr = bottom_blob_int8_flattened; - int i = 0; - for (; i < num_input; i++) + for (int i = 0; i < num_input; i++) { - signed char val = sptr[0]; - - signed char w = kptr[0]; - - sum += val * w; + sum += *sptr * *kptr; sptr += 1; kptr += 1; From 2dacf8043f119bc9ee48b39afeb430034236e67b Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Thu, 8 Feb 2024 21:55:14 +0800 Subject: [PATCH 05/10] riscv quantize packing Signed-off-by: Molly Sophia --- src/layer/riscv/quantize_riscv.cpp | 688 +++++++++++++++++++++++++++++ src/layer/riscv/quantize_riscv.h | 32 ++ 2 files changed, 720 insertions(+) create mode 100644 src/layer/riscv/quantize_riscv.cpp create mode 100644 src/layer/riscv/quantize_riscv.h diff --git a/src/layer/riscv/quantize_riscv.cpp 
b/src/layer/riscv/quantize_riscv.cpp new file mode 100644 index 000000000000..172c7d45ab3d --- /dev/null +++ b/src/layer/riscv/quantize_riscv.cpp @@ -0,0 +1,688 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_riscv.h" + +#include "riscv_usability.h" + +namespace ncnn { + +Quantize_riscv::Quantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_vector + int packn = csrr_vlenb() / 4; + int out_packn = packn * 4; + size_t vl = vsetvl_e32m4(packn); +#endif + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack == packn) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % out_packn == 0 ? out_packn : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % out_packn == 0 ? 
out_packn : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == out_packn) + { + if (scale_data_size == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 4); + const float* ptr1 = bottom_blob.row(i * 4 + 1); + const float* ptr2 = bottom_blob.row(i * 4 + 2); + const float* ptr3 = bottom_blob.row(i * 4 + 3); + signed char* outptr = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + vfloat32m4_t _ptr1 = vle32_v_f32m4(ptr1, vl); + vfloat32m4_t _ptr2 = vle32_v_f32m4(ptr2, vl); + vfloat32m4_t _ptr3 = vle32_v_f32m4(ptr3, vl); + _ptr0 = vfmul_vf_f32m4(_ptr0, scale_data[0], vl); + _ptr1 = vfmul_vf_f32m4(_ptr1, scale_data[0], vl); + _ptr2 = vfmul_vf_f32m4(_ptr2, scale_data[0], vl); + _ptr3 = vfmul_vf_f32m4(_ptr3, scale_data[0], vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vint8m1_t out1 = float2int8(_ptr1, vl); + vint8m1_t out2 = float2int8(_ptr2, vl); + vint8m1_t out3 = float2int8(_ptr3, vl); + vse8_v_i8m1(outptr, out0, vl); + vse8_v_i8m1(outptr + packn, out1, vl); + vse8_v_i8m1(outptr + 2 * packn, out2, vl); + vse8_v_i8m1(outptr + 3 * packn, out3, vl); + + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + outptr += out_packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 4); + const float* ptr1 = bottom_blob.row(i * 4 + 1); + const float* ptr2 = bottom_blob.row(i * 4 + 2); + const float* ptr3 = bottom_blob.row(i * 4 + 3); + signed char* outptr = top_blob.row(i); + vfloat32m4_t _scale0 = vle32_v_f32m4((const float*)scale_data + 4 * i * packn, vl); + vfloat32m4_t _scale1 = vle32_v_f32m4((const float*)scale_data + (4 * i + 1) * packn, vl); + vfloat32m4_t _scale2 = vle32_v_f32m4((const float*)scale_data + (4 * i + 2) * packn, vl); + vfloat32m4_t _scale3 = vle32_v_f32m4((const float*)scale_data + (4 * i + 3) * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + vfloat32m4_t _ptr1 = vle32_v_f32m4(ptr1, vl); + vfloat32m4_t _ptr2 = vle32_v_f32m4(ptr2, vl); + vfloat32m4_t _ptr3 = vle32_v_f32m4(ptr3, vl); + _ptr0 = vfmul_vv_f32m4(_ptr0, _scale0, vl); + _ptr1 = vfmul_vv_f32m4(_ptr1, _scale1, vl); + _ptr2 = vfmul_vv_f32m4(_ptr2, _scale2, vl); + _ptr3 = vfmul_vv_f32m4(_ptr3, _scale3, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vint8m1_t out1 = float2int8(_ptr1, vl); + vint8m1_t out2 = float2int8(_ptr2, vl); + vint8m1_t out3 = float2int8(_ptr3, vl); + vse8_v_i8m1(outptr, out0, vl); + vse8_v_i8m1(outptr + packn, out1, vl); + vse8_v_i8m1(outptr + 2 * packn, out2, vl); + vse8_v_i8m1(outptr + 3 * packn, out3, vl); + + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + outptr += out_packn; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * packn); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + _ptr0 = vfmul_vf_f32m4(_ptr0, scale, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vsse8_v_i8m1(outptr0, top_blob.w * sizeof(int8_t), out0, vl); + + ptr0 += packn; + 
outptr0 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * packn); + + vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + _ptr0 = vfmul_vv_f32m4(_ptr0, _scale, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vsse8_v_i8m1(outptr0, top_blob.w * sizeof(int8_t), out0, vl); + + ptr0 += packn; + outptr0 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % out_packn == 0 ? out_packn : 1; + int outc = channels * elempack / out_elempack; + NCNN_LOGE("out_elempack:%d", out_elempack); + NCNN_LOGE("outc:%d", outc); + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == out_packn) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 4); + const float* ptr1 = bottom_blob.channel(q * 4 + 1); + const float* ptr2 = bottom_blob.channel(q * 4 + 2); + const float* ptr3 = bottom_blob.channel(q * 4 + 3); + signed char* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + vfloat32m4_t _ptr1 = vle32_v_f32m4(ptr1, vl); + vfloat32m4_t _ptr2 = vle32_v_f32m4(ptr2, vl); + vfloat32m4_t _ptr3 = vle32_v_f32m4(ptr3, vl); + _ptr0 = vfmul_vf_f32m4(_ptr0, scale, vl); + _ptr1 = vfmul_vf_f32m4(_ptr1, scale, vl); + _ptr2 = vfmul_vf_f32m4(_ptr2, scale, vl); + _ptr3 = vfmul_vf_f32m4(_ptr3, scale, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vint8m1_t out1 = float2int8(_ptr1, vl); + vint8m1_t out2 = float2int8(_ptr2, vl); + vint8m1_t out3 = float2int8(_ptr3, vl); + vse8_v_i8m1(outptr, out0, vl); + vse8_v_i8m1(outptr + packn, out1, vl); + vse8_v_i8m1(outptr + 2 * packn, out2, vl); + vse8_v_i8m1(outptr + 3 * packn, out3, vl); + + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + outptr += out_packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 4); + const float* ptr1 = bottom_blob.channel(q * 4 + 1); + const float* ptr2 = bottom_blob.channel(q * 4 + 2); + const float* ptr3 = bottom_blob.channel(q * 4 + 3); + signed char* outptr = top_blob.channel(q); + + vfloat32m4_t _scale0 = vle32_v_f32m4((const float*)scale_data + q * 4 * packn, vl); + vfloat32m4_t _scale1 = vle32_v_f32m4((const float*)scale_data + (q * 4 + 1) * packn, vl); + vfloat32m4_t _scale2 = vle32_v_f32m4((const float*)scale_data + (q * 4 + 2) * packn, vl); + vfloat32m4_t _scale3 = vle32_v_f32m4((const float*)scale_data + (q * 4 + 3) * packn, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + vfloat32m4_t _ptr1 = vle32_v_f32m4(ptr1, vl); + vfloat32m4_t _ptr2 = vle32_v_f32m4(ptr2, vl); + vfloat32m4_t _ptr3 = vle32_v_f32m4(ptr3, vl); + _ptr0 = vfmul_vv_f32m4(_ptr0, _scale0, vl); + _ptr1 = vfmul_vv_f32m4(_ptr1, _scale1, vl); + _ptr2 = vfmul_vv_f32m4(_ptr2, _scale2, vl); + _ptr3 = vfmul_vv_f32m4(_ptr3, _scale3, vl); + vint8m1_t out0 = 
float2int8(_ptr0, vl); + vint8m1_t out1 = float2int8(_ptr1, vl); + vint8m1_t out2 = float2int8(_ptr2, vl); + vint8m1_t out3 = float2int8(_ptr3, vl); + vse8_v_i8m1(outptr, out0, vl); + vse8_v_i8m1(outptr + packn, out1, vl); + vse8_v_i8m1(outptr + 2 * packn, out2, vl); + vse8_v_i8m1(outptr + 3 * packn, out3, vl); + + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + outptr += out_packn; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * packn); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + _ptr0 = vfmul_vf_f32m4(_ptr0, scale, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), out0, vl); + + ptr0 += packn; + outptr0 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * packn); + + vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + q * packn, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _ptr0 = vle32_v_f32m4(ptr0, vl); + _ptr0 = vfmul_vv_f32m4(_ptr0, _scale, vl); + vint8m1_t out0 = float2int8(_ptr0, vl); + vsse8_v_i8m1(outptr0, top_blob.cstep * sizeof(int8_t), out0, vl); + + ptr0 += packn; + outptr0 += 1; + } + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + + *outptr++ = float2int8(*ptr++ * scale); + } + } + else + { + const float* scaleptr = scale_data; +#if __riscv_vector + int num_nn = w / (packn * 8); + int remain_w_start = num_nn * packn * 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + vfloat32m4_t _p4 = vle32_v_f32m4(ptr + 4 * packn, vl); + vfloat32m4_t _p5 = vle32_v_f32m4(ptr + 5 * packn, vl); + vfloat32m4_t _p6 = vle32_v_f32m4(ptr + 6 * packn, vl); + vfloat32m4_t _p7 = vle32_v_f32m4(ptr + 7 * packn, vl); + vfloat32m4_t _scale0 = vle32_v_f32m4(scaleptr, vl); + vfloat32m4_t _scale1 = vle32_v_f32m4(scaleptr + packn, vl); + vfloat32m4_t _scale2 = vle32_v_f32m4(scaleptr + 2 * packn, vl); + vfloat32m4_t _scale3 = vle32_v_f32m4(scaleptr + 3 * packn, vl); + vfloat32m4_t _scale4 = vle32_v_f32m4(scaleptr + 4 * packn, vl); + vfloat32m4_t _scale5 = vle32_v_f32m4(scaleptr + 5 * packn, vl); + vfloat32m4_t _scale6 = vle32_v_f32m4(scaleptr + 6 * packn, vl); + vfloat32m4_t _scale7 = vle32_v_f32m4(scaleptr + 7 * packn, vl); + _p0 = vfmul_vv_f32m4(_p0, _scale0, vl); + _p1 = vfmul_vv_f32m4(_p1, _scale1, vl); + _p2 = vfmul_vv_f32m4(_p2, _scale2, vl); + _p3 = vfmul_vv_f32m4(_p3, _scale3, vl); + _p4 = vfmul_vv_f32m4(_p4, _scale4, vl); + _p5 = vfmul_vv_f32m4(_p5, _scale5, vl); + _p6 = vfmul_vv_f32m4(_p6, _scale6, vl); + 
_p7 = vfmul_vv_f32m4(_p7, _scale7, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vint8m1_t _outp4 = float2int8(_p4, vl); + vint8m1_t _outp5 = float2int8(_p5, vl); + vint8m1_t _outp6 = float2int8(_p6, vl); + vint8m1_t _outp7 = float2int8(_p7, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + vse8_v_i8m1(outptr + 4 * packn, _outp4, vl); + vse8_v_i8m1(outptr + 5 * packn, _outp5, vl); + vse8_v_i8m1(outptr + 6 * packn, _outp6, vl); + vse8_v_i8m1(outptr + 7 * packn, _outp7, vl); + ptr += 8 * packn; + outptr += 8 * packn; + scaleptr += 8 * packn; + } + + num_nn = (w - remain_w_start) / (packn * 4); + remain_w_start += num_nn * packn * 4; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + vfloat32m4_t _scale0 = vle32_v_f32m4(scaleptr, vl); + vfloat32m4_t _scale1 = vle32_v_f32m4(scaleptr + packn, vl); + vfloat32m4_t _scale2 = vle32_v_f32m4(scaleptr + 2 * packn, vl); + vfloat32m4_t _scale3 = vle32_v_f32m4(scaleptr + 3 * packn, vl); + _p0 = vfmul_vv_f32m4(_p0, _scale0, vl); + _p1 = vfmul_vv_f32m4(_p1, _scale1, vl); + _p2 = vfmul_vv_f32m4(_p2, _scale2, vl); + _p3 = vfmul_vv_f32m4(_p3, _scale3, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + ptr += 4 * packn; + outptr += 4 * packn; + scaleptr += 4 * packn; + } +#else + int remain_w_start = 0; +#endif + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + *outptr++ = float2int8(*ptr++ * *scaleptr++); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + signed char* outptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[i]; + +#if __riscv_vector + int num_nn = w / (packn * 8); + int remain_w_start = num_nn * packn * 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + vfloat32m4_t _p4 = vle32_v_f32m4(ptr + 4 * packn, vl); + vfloat32m4_t _p5 = vle32_v_f32m4(ptr + 5 * packn, vl); + vfloat32m4_t _p6 = vle32_v_f32m4(ptr + 6 * packn, vl); + vfloat32m4_t _p7 = vle32_v_f32m4(ptr + 7 * packn, vl); + _p0 = vfmul_vf_f32m4(_p0, scale, vl); + _p1 = vfmul_vf_f32m4(_p1, scale, vl); + _p2 = vfmul_vf_f32m4(_p2, scale, vl); + _p3 = vfmul_vf_f32m4(_p3, scale, vl); + _p4 = vfmul_vf_f32m4(_p4, scale, vl); + _p5 = vfmul_vf_f32m4(_p5, scale, vl); + _p6 = vfmul_vf_f32m4(_p6, scale, vl); + _p7 = vfmul_vf_f32m4(_p7, scale, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vint8m1_t _outp4 = float2int8(_p4, vl); + vint8m1_t _outp5 = float2int8(_p5, vl); + vint8m1_t _outp6 = float2int8(_p6, vl); + vint8m1_t _outp7 = float2int8(_p7, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + vse8_v_i8m1(outptr + 4 * packn, _outp4, vl); + vse8_v_i8m1(outptr + 5 * packn, _outp5, vl); + vse8_v_i8m1(outptr + 6 * packn, _outp6, vl); + vse8_v_i8m1(outptr + 7 * packn, _outp7, vl); + ptr += 8 * packn; + outptr += 8 * packn; + } + + num_nn = (w - remain_w_start) / (packn * 4); + remain_w_start += num_nn * packn * 4; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + _p0 = vfmul_vf_f32m4(_p0, scale, vl); + _p1 = vfmul_vf_f32m4(_p1, scale, vl); + _p2 = vfmul_vf_f32m4(_p2, scale, vl); + _p3 = vfmul_vf_f32m4(_p3, scale, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + ptr += 4 * packn; + outptr += 4 * packn; + } +#else + int remain_w_start = 0; +#endif + for (int j = remain_w_start; j < w; j++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + +#if __riscv_vector + int num_nn = w / (packn * 8); + int remain_w_start = num_nn * packn * 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + vfloat32m4_t _p4 = vle32_v_f32m4(ptr + 4 * packn, vl); + vfloat32m4_t _p5 = vle32_v_f32m4(ptr + 5 * packn, vl); + vfloat32m4_t _p6 = vle32_v_f32m4(ptr + 6 * packn, vl); + vfloat32m4_t _p7 = vle32_v_f32m4(ptr + 7 * packn, vl); + _p0 = vfmul_vf_f32m4(_p0, scale, vl); + _p1 = vfmul_vf_f32m4(_p1, scale, vl); + _p2 = vfmul_vf_f32m4(_p2, scale, vl); + _p3 = vfmul_vf_f32m4(_p3, scale, vl); + _p4 = vfmul_vf_f32m4(_p4, scale, vl); + _p5 = vfmul_vf_f32m4(_p5, scale, vl); + _p6 = vfmul_vf_f32m4(_p6, scale, vl); + _p7 = vfmul_vf_f32m4(_p7, scale, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vint8m1_t _outp4 = float2int8(_p4, vl); + vint8m1_t _outp5 = float2int8(_p5, vl); + vint8m1_t _outp6 = float2int8(_p6, vl); + vint8m1_t _outp7 = float2int8(_p7, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + vse8_v_i8m1(outptr + 4 * packn, _outp4, vl); + vse8_v_i8m1(outptr + 5 * packn, _outp5, vl); + vse8_v_i8m1(outptr + 6 * packn, _outp6, vl); + vse8_v_i8m1(outptr + 7 * packn, _outp7, vl); + ptr += 8 * packn; + outptr += 8 * packn; + } + + num_nn = (w - remain_w_start) / (packn * 4); + remain_w_start += num_nn * packn * 4; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p1 = vle32_v_f32m4(ptr + packn, vl); + vfloat32m4_t _p2 = vle32_v_f32m4(ptr + 2 * packn, vl); + vfloat32m4_t _p3 = vle32_v_f32m4(ptr + 3 * packn, vl); + _p0 = vfmul_vf_f32m4(_p0, scale, vl); + _p1 = vfmul_vf_f32m4(_p1, scale, vl); + _p2 = vfmul_vf_f32m4(_p2, scale, vl); + _p3 = vfmul_vf_f32m4(_p3, scale, vl); + vint8m1_t _outp0 = float2int8(_p0, vl); + vint8m1_t _outp1 = float2int8(_p1, vl); + vint8m1_t _outp2 = float2int8(_p2, vl); + vint8m1_t _outp3 = float2int8(_p3, vl); + vse8_v_i8m1(outptr, _outp0, vl); + vse8_v_i8m1(outptr + packn, _outp1, vl); + vse8_v_i8m1(outptr + 2 * packn, _outp2, vl); + vse8_v_i8m1(outptr + 3 * packn, _outp3, vl); + ptr += 4 * packn; + outptr += 4 * packn; + } +#else + int remain_w_start = 0; +#endif + for (int i = remain_w_start; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/quantize_riscv.h b/src/layer/riscv/quantize_riscv.h new file mode 100644 index 000000000000..c91c93b17bfa --- /dev/null +++ b/src/layer/riscv/quantize_riscv.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_QUANTIZE_RISCV_H +#define LAYER_QUANTIZE_RISCV_H + +#include "quantize.h" + +namespace ncnn { + +class Quantize_riscv : virtual public Quantize +{ +public: + Quantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_QUANTIZE_RISCV_H From 50425e16a0e0949d95418a0430d47f2dea71d4c8 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 9 Feb 2024 12:12:28 +0800 Subject: [PATCH 06/10] riscv requantize packing Signed-off-by: Molly Sophia --- src/layer/riscv/requantize_riscv.cpp | 683 +++++++++++++++++++++++++++ src/layer/riscv/requantize_riscv.h | 32 ++ src/layer/riscv/riscv_usability.h | 51 ++ tests/test_dequantize.cpp | 4 + tests/test_requantize.cpp | 4 + tests/testutil.h | 4 + 6 files changed, 778 insertions(+) create mode 100644 src/layer/riscv/requantize_riscv.cpp create mode 100644 src/layer/riscv/requantize_riscv.h diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp new file mode 100644 index 000000000000..220087691d54 --- /dev/null +++ b/src/layer/riscv/requantize_riscv.cpp @@ -0,0 +1,683 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
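+// requantize: int32 -> int8 with per-tensor or per-channel scale_in/scale_out, optional bias and fused activation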
+ +#include "requantize_riscv.h" + +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +Requantize_riscv::Requantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_vector + int packn = csrr_vlenb(); + size_t vl = vsetvl_e32m4(packn); +#endif + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack != packn && elempack != 1) + { + Mat bottom_blob_unpacked; + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt); + return forward(bottom_blob_unpacked, top_blob, opt); + } + + if (elempack == packn) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vf_f32m4(_v, scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vf_f32m4(_v, scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma 
omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vv_f32m4(_v, _scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i 
* packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vv_f32m4(_v, _scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + signed char* ptr = (signed char*)top_blob + i * packn; + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m4_t scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m4(scale_in_data[0], vl) : vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m4(scale_out_data[0], vl) : vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vv_f32m4(_v, scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + + intptr += packn; + ptr += packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m4_t scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m4(scale_in_data[0], vl) : vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); + vfloat32m4_t scale_out = scale_out_data_size == 1 ? 
vfmv_v_f_f32m4(scale_out_data[0], vl) : vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); + vfloat32m4_t bias = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + i * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, scale_in, bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + + intptr += packn; + ptr += packn; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // if (activation_type == 1) + // { + // requantize_relu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + // return 0; + // } + + // if (activation_type == 2 && activation_params[0] > 0.f) + // { + // requantize_leakyrelu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + // return 0; + // } + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m4_t scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m4(scale_in_data[0], vl) : vle32_v_f32m4((const float*)scale_in_data + q * packn, vl); + vfloat32m4_t scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m4(scale_out_data[0], vl) : vle32_v_f32m4((const float*)scale_out_data + q * packn, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmul_vv_f32m4(_v, scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + + intptr += packn; + ptr += packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m4_t scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m4(scale_in_data[0], vl) : vle32_v_f32m4((const float*)scale_in_data + q * packn, vl); + vfloat32m4_t scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m4(scale_out_data[0], vl) : vle32_v_f32m4((const float*)scale_out_data + q * packn, vl); + vfloat32m4_t bias = bias_data_size == 1 ? 
vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + q * packn, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = activation_ps(vfmacc_vv_f32m4(_v, scale_in, bias, vl), activation_type, activation_params, vl); + vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); + vse8_v_i8m1(ptr, _out, vl); + + intptr += packn; + ptr += packn; + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + signed char* ptr = top_blob; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + 
else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in + bias; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? 
scale_out_data[0] : scale_out_data[q]; + +#if __riscv_vector + int num_nn = size / (packn * 2); + int remain_i_start = num_nn * packn * 2; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_nn; i++) + { + vfloat32m4_t _p0 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + vfloat32m4_t _p1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); + _p0 = activation_ps(vfmul_vf_f32m4(_p0, scale_in, vl), activation_type, activation_params, vl); + _p1 = activation_ps(vfmul_vf_f32m4(_p1, scale_in, vl), activation_type, activation_params, vl); + vint8m1_t _outp0 = float2int8(vfmul_vf_f32m4(_p0, scale_out, vl), vl); + vint8m1_t _outp1 = float2int8(vfmul_vf_f32m4(_p1, scale_out, vl), vl); + vse8_v_i8m1(ptr, _outp0, vl); + vse8_v_i8m1(ptr + packn, _outp1, vl); + ptr += packn * 2; + intptr += packn * 2; + } +#else + int remain_i_start = 0; +#endif + for (int i = remain_i_start; i < size; i++) + { + float v = *intptr * scale_in; + *ptr = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + intptr++; + ptr++; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/requantize_riscv.h b/src/layer/riscv/requantize_riscv.h new file mode 100644 index 000000000000..265c54a6be82 --- /dev/null +++ b/src/layer/riscv/requantize_riscv.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
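+// Requantize_riscv converts int32 blobs back to int8: v = int32 * scale_in (+ bias), activation, then * scale_out,
+// rounded and clamped to [-127, 127]; requantize_riscv.cpp adds RVV-vectorized paths when __riscv_vector is set.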
+ +#ifndef LAYER_REQUANTIZE_RISCV_H +#define LAYER_REQUANTIZE_RISCV_H + +#include "requantize.h" + +namespace ncnn { + +class Requantize_riscv : virtual public Requantize +{ +public: + Requantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_REQUANTIZE_RISCV_H diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index e2824646f871..1e1ba95ae80f 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -50,6 +50,26 @@ static inline int csrr_vlenb() return a; } +static inline int fcsr_frrm() +{ + int a = 0; + asm volatile("frrm %0" + : "=r"(a) + : + : "memory"); + return a; +} + +static inline int fcsr_fsrm(int frm) +{ + int a = 0; + asm volatile("fsrm %0, %1" + : "=r"(a) + : "r"(frm) + : "memory"); + return a; +} + static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; @@ -615,5 +635,36 @@ static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, _r3h = vle32_v_f32m1(ptr + 7 * 4, vl); } #endif +#if NCNN_INT8 +#if __riscv_vector +static inline vint8m1_t float2int8(vfloat32m4_t _v, size_t vl) +{ + int a = fcsr_fsrm(4); + vint16m2_t _vi16 = vfncvt_x_f_w_i16m2(_v, vl); + fcsr_fsrm(a); + _vi16 = vmax_vx_i16m2(_vi16, -127, vl); + _vi16 = vmin_vx_i16m2(_vi16, 127, vl); + return vncvt_x_x_w_i8m1(_vi16, vl); +} + +static inline vint8m2_t float2int8(vfloat32m8_t _v, size_t vl) +{ + int a = fcsr_fsrm(4); + vint16m4_t _vi16 = vfncvt_x_f_w_i16m4(_v, vl); + fcsr_fsrm(a); + _vi16 = vmax_vx_i16m4(_vi16, -127, vl); + _vi16 = vmin_vx_i16m4(_vi16, 127, vl); + return vncvt_x_x_w_i8m2(_vi16, vl); +} +#endif // __riscv_vector + +static inline signed char float2int8(float v) +{ + int int32 = round(v); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return (signed char)int32; +} +#endif // NCNN_INT8 #endif // RISCV_USABILITY_H diff --git a/tests/test_dequantize.cpp b/tests/test_dequantize.cpp index ca05059fa450..9bb76b786856 100644 --- a/tests/test_dequantize.cpp +++ b/tests/test_dequantize.cpp @@ -46,7 +46,11 @@ static int test_dequantize_pack8(const ncnn::Mat& a, int scale_data_size, int bi if (bias_data_size) weights[1] = RandomMat(bias_data_size); +#if NCNN_RVV + int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACKVLENB; +#else int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACK8; +#endif int ret = test_layer("Dequantize", pd, weights, a, 0.001, 0, flag); if (ret != 0) { diff --git a/tests/test_requantize.cpp b/tests/test_requantize.cpp index 1032d529ea67..017b0645a872 100644 --- a/tests/test_requantize.cpp +++ b/tests/test_requantize.cpp @@ -79,7 +79,11 @@ static int test_requantize_pack8(const ncnn::Mat& a, int scale_in_data_size, int Randomize(weights[0], 0.0001, 0.001); Randomize(weights[1], 10, 100); +#if NCNN_RVV + int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACKVLENB; +#else int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACK8; +#endif int ret = test_layer("Requantize", pd, weights, a, 1, 0, flag); if (ret != 0) { diff --git a/tests/testutil.h b/tests/testutil.h index 12f9d0daa654..614b294044f1 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -28,6 +28,10 @@ #define TEST_LAYER_DISABLE_GPU_TESTING (1 << 2) #define TEST_LAYER_ENABLE_FORCE_INPUT_PACK8 (1 << 3) +#if NCNN_RVV +#define TEST_LAYER_ENABLE_FORCE_INPUT_PACKVLENB (1 
<< 4) +#endif // NCNN_RVV + void SRAND(int seed); uint64_t RAND(); From cc2439714791a67a4150141e885308ddd6b5e303 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 9 Feb 2024 15:19:19 +0800 Subject: [PATCH 07/10] riscv dequantize packing Signed-off-by: Molly Sophia --- src/layer/riscv/dequantize_riscv.cpp | 565 +++++++++++++++++++++++++++ src/layer/riscv/dequantize_riscv.h | 32 ++ 2 files changed, 597 insertions(+) create mode 100644 src/layer/riscv/dequantize_riscv.cpp create mode 100644 src/layer/riscv/dequantize_riscv.h diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp new file mode 100644 index 000000000000..9a172a414052 --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.cpp @@ -0,0 +1,565 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dequantize_riscv.h" + +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +Dequantize_riscv::Dequantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif +} + +int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_vector + int packn = csrr_vlenb() / 4; + int in_packn = packn * 4; + size_t vl = vsetvl_e32m4(packn); +#endif + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack != in_packn && elempack != 1) + { + Mat bottom_blob_unpacked; + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt); + return forward(bottom_blob_unpacked, top_blob, opt); + } + + if (elempack == in_packn) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * 4; + + top_blob.create(outw, (size_t)4u * packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmul_vf_f32m4(_v, scale, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vf_f32m4(_v, scale, _bias, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + 
i * packn, vl); + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vf_f32m4(_v, scale, _bias, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmul_vv_f32m4(_v, _scale, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + else if (bias_data_size == 1) + { + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vv_f32m4(_v, _scale, _bias, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * packn; + float* ptr = (float*)top_blob + i * packn; + + vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); + vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vv_f32m4(_v, _scale, _bias, vl); + vse32_v_f32m4(ptr, _v, vl); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * 4; + + top_blob.create(w, outh, (size_t)4u * packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 4); + float* ptr1 = top_blob.row(i * 4 + 1); + float* ptr2 = top_blob.row(i * 4 + 2); + float* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m4_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4) * packn, vl); + vfloat32m4_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 1) * packn, vl); + vfloat32m4_t _scale2 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 2) * packn, vl); + vfloat32m4_t _scale3 = scale_data_size == 1 ? 
vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 3) * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _v0 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); + vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); + vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); + _v0 = vfmul_vv_f32m4(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m4(_v1, _scale1, vl); + _v2 = vfmul_vv_f32m4(_v2, _scale2, vl); + _v3 = vfmul_vv_f32m4(_v3, _scale3, vl); + vse32_v_f32m4(ptr0, _v0, vl); + vse32_v_f32m4(ptr1, _v1, vl); + vse32_v_f32m4(ptr2, _v2, vl); + vse32_v_f32m4(ptr3, _v3, vl); + + intptr += in_packn; + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 4); + float* ptr1 = top_blob.row(i * 4 + 1); + float* ptr2 = top_blob.row(i * 4 + 2); + float* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m4_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4) * packn, vl); + vfloat32m4_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 1) * packn, vl); + vfloat32m4_t _scale2 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 2) * packn, vl); + vfloat32m4_t _scale3 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (i * 4 + 3) * packn, vl); + vfloat32m4_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (i * 4) * packn, vl); + vfloat32m4_t _bias1 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (i * 4 + 1) * packn, vl); + vfloat32m4_t _bias2 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (i * 4 + 2) * packn, vl); + vfloat32m4_t _bias3 = bias_data_size == 1 ? 
vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (i * 4 + 3) * packn, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m4_t _v0 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); + vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); + vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); + _v0 = vfmacc_vv_f32m4(_v0, _scale0, _bias0, vl); + _v1 = vfmacc_vv_f32m4(_v1, _scale1, _bias1, vl); + _v2 = vfmacc_vv_f32m4(_v2, _scale2, _bias2, vl); + _v3 = vfmacc_vv_f32m4(_v3, _scale3, _bias3, vl); + vse32_v_f32m4(ptr0, _v0, vl); + vse32_v_f32m4(ptr1, _v1, vl); + vse32_v_f32m4(ptr2, _v2, vl); + vse32_v_f32m4(ptr3, _v3, vl); + + intptr += in_packn; + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * 4; + + top_blob.create(w, h, outc, (size_t)4u * packn, packn, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 4); + float* ptr1 = top_blob.channel(q * 4 + 1); + float* ptr2 = top_blob.channel(q * 4 + 2); + float* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m4_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4) * packn, vl); + vfloat32m4_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 1) * packn, vl); + vfloat32m4_t _scale2 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 2) * packn, vl); + vfloat32m4_t _scale3 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 3) * packn, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _v0 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); + vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); + vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); + _v0 = vfmul_vv_f32m4(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m4(_v1, _scale1, vl); + _v2 = vfmul_vv_f32m4(_v2, _scale2, vl); + _v3 = vfmul_vv_f32m4(_v3, _scale3, vl); + vse32_v_f32m4(ptr0, _v0, vl); + vse32_v_f32m4(ptr1, _v1, vl); + vse32_v_f32m4(ptr2, _v2, vl); + vse32_v_f32m4(ptr3, _v3, vl); + + intptr += in_packn; + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 4); + float* ptr1 = top_blob.channel(q * 4 + 1); + float* ptr2 = top_blob.channel(q * 4 + 2); + float* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m4_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4) * packn, vl); + vfloat32m4_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 1) * packn, vl); + vfloat32m4_t _scale2 = scale_data_size == 1 ? 
vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 2) * packn, vl); + vfloat32m4_t _scale3 = scale_data_size == 1 ? vfmv_v_f_f32m4(scale_data[0], vl) : vle32_v_f32m4((const float*)scale_data + (q * 4 + 3) * packn, vl); + vfloat32m4_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (q * 4) * packn, vl); + vfloat32m4_t _bias1 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (q * 4 + 1) * packn, vl); + vfloat32m4_t _bias2 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (q * 4 + 2) * packn, vl); + vfloat32m4_t _bias3 = bias_data_size == 1 ? vfmv_v_f_f32m4(bias_data[0], vl) : vle32_v_f32m4((const float*)bias_data + (q * 4 + 3) * packn, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m4_t _v0 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); + vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); + vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); + _v0 = vfmacc_vv_f32m4(_v0, _scale0, _bias0, vl); + _v1 = vfmacc_vv_f32m4(_v1, _scale1, _bias1, vl); + _v2 = vfmacc_vv_f32m4(_v2, _scale2, _bias2, vl); + _v3 = vfmacc_vv_f32m4(_v3, _scale3, _bias3, vl); + vse32_v_f32m4(ptr0, _v0, vl); + vse32_v_f32m4(ptr1, _v1, vl); + vse32_v_f32m4(ptr2, _v2, vl); + vse32_v_f32m4(ptr3, _v3, vl); + + intptr += in_packn; + ptr0 += packn; + ptr1 += packn; + ptr2 += packn; + ptr3 += packn; + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + float* ptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias_data[i]; + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i]; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[i]; + + int j = 0; +#if __riscv_vector + for (; j + packn < w; j += packn) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmul_vf_f32m4(_v, scale, vl); + vse32_v_f32m4(ptr, _v, vl); + + intptr += packn; + ptr += packn; + } +#endif // __riscv_vector + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + int j = 0; +#if __riscv_vector + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias, vl); + for (; j + packn < w; j += packn) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vf_f32m4(_bias, scale, _v, vl); + vse32_v_f32m4(ptr, _v, vl); + + intptr += packn; + ptr += packn; + } +#endif // __riscv_vector + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + int i = 0; +#if __riscv_vector + for (; i + packn < size; i += packn) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmul_vf_f32m4(_v, scale, vl); + vse32_v_f32m4(ptr, _v, vl); + + intptr += packn; + ptr += packn; + } +#endif // __riscv_vector + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + int i = 0; +#if __riscv_vector + vfloat32m4_t _bias = vfmv_v_f_f32m4(bias, vl); + for (; i + packn < size; i += packn) + { + vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); + _v = vfmacc_vf_f32m4(_bias, scale, _v, vl); + vse32_v_f32m4(ptr, _v, vl); + + intptr += packn; + ptr += packn; + } +#endif // __riscv_vector + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/dequantize_riscv.h b/src/layer/riscv/dequantize_riscv.h new file mode 100644 index 000000000000..f91feb9cb143 --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_RISCV_H +#define LAYER_DEQUANTIZE_RISCV_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_riscv : virtual public Dequantize +{ +public: + Dequantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_RISCV_H From 4936e808e99426d3cbb55b542a1de5246b8b1c5e Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Thu, 15 Feb 2024 18:45:45 +0800 Subject: [PATCH 08/10] fix riscv requantize/dequantize packing Signed-off-by: Molly Sophia --- src/layer/riscv/dequantize_riscv.cpp | 24 ++++++++++++------------ src/layer/riscv/quantize_riscv.cpp | 2 -- src/layer/riscv/requantize_riscv.cpp | 20 ++++++++++---------- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp index 9a172a414052..27662b4f98e1 100644 --- a/src/layer/riscv/dequantize_riscv.cpp +++ b/src/layer/riscv/dequantize_riscv.cpp @@ -83,7 +83,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio float* ptr = (float*)top_blob + i * packn; vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = vfmacc_vf_f32m4(_v, scale, _bias, vl); + _v = vfmacc_vf_f32m4(_bias, scale, _v, vl); vse32_v_f32m4(ptr, _v, vl); } } @@ -97,7 +97,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = vfmacc_vf_f32m4(_v, scale, _bias, vl); + _v = vfmacc_vf_f32m4(_bias, scale, _v, vl); vse32_v_f32m4(ptr, _v, vl); } } @@ -130,7 +130,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = vfmacc_vv_f32m4(_v, _scale, _bias, vl); + _v = vfmacc_vv_f32m4(_bias, _scale, _v, vl); vse32_v_f32m4(ptr, _v, vl); } } @@ -145,7 +145,7 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale = vle32_v_f32m4((const float*)scale_data + i * packn, vl); vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = vfmacc_vv_f32m4(_v, _scale, _bias, vl); + _v = vfmacc_vv_f32m4(_bias, _scale, _v, vl); vse32_v_f32m4(ptr, _v, vl); } } @@ -227,10 +227,10 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); - _v0 = vfmacc_vv_f32m4(_v0, _scale0, _bias0, vl); - _v1 = vfmacc_vv_f32m4(_v1, _scale1, _bias1, vl); - _v2 = vfmacc_vv_f32m4(_v2, _scale2, _bias2, vl); - _v3 = vfmacc_vv_f32m4(_v3, _scale3, _bias3, vl); + _v0 = vfmacc_vv_f32m4(_bias0, _scale0, 
_v0, vl); + _v1 = vfmacc_vv_f32m4(_bias1, _scale1, _v1, vl); + _v2 = vfmacc_vv_f32m4(_bias2, _scale2, _v2, vl); + _v3 = vfmacc_vv_f32m4(_bias3, _scale3, _v3, vl); vse32_v_f32m4(ptr0, _v0, vl); vse32_v_f32m4(ptr1, _v1, vl); vse32_v_f32m4(ptr2, _v2, vl); @@ -323,10 +323,10 @@ int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _v1 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn, vl), vl); vfloat32m4_t _v2 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 2, vl), vl); vfloat32m4_t _v3 = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr + packn * 3, vl), vl); - _v0 = vfmacc_vv_f32m4(_v0, _scale0, _bias0, vl); - _v1 = vfmacc_vv_f32m4(_v1, _scale1, _bias1, vl); - _v2 = vfmacc_vv_f32m4(_v2, _scale2, _bias2, vl); - _v3 = vfmacc_vv_f32m4(_v3, _scale3, _bias3, vl); + _v0 = vfmacc_vv_f32m4(_bias0, _scale0, _v0, vl); + _v1 = vfmacc_vv_f32m4(_bias1, _scale1, _v1, vl); + _v2 = vfmacc_vv_f32m4(_bias2, _scale2, _v2, vl); + _v3 = vfmacc_vv_f32m4(_bias3, _scale3, _v3, vl); vse32_v_f32m4(ptr0, _v0, vl); vse32_v_f32m4(ptr1, _v1, vl); vse32_v_f32m4(ptr2, _v2, vl); diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index 172c7d45ab3d..ed2a331dfd9a 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -231,8 +231,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int size = w * h; int out_elempack = opt.use_packing_layout && channels * elempack % out_packn == 0 ? out_packn : 1; int outc = channels * elempack / out_elempack; - NCNN_LOGE("out_elempack:%d", out_elempack); - NCNN_LOGE("outc:%d", outc); top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); if (top_blob.empty()) diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp index 220087691d54..f2d3db2d1ff7 100644 --- a/src/layer/riscv/requantize_riscv.cpp +++ b/src/layer/riscv/requantize_riscv.cpp @@ -83,7 +83,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio signed char* ptr = (signed char*)top_blob + i * packn; vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -98,7 +98,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -137,7 +137,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, 
_scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -154,7 +154,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_v, scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -193,7 +193,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -210,7 +210,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -249,7 +249,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -267,7 +267,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vv_f32m4(_v, _scale_in, _bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(_bias, _scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -322,7 +322,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio for (int j = 0; j < w; j++) { vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vv_f32m4(_v, scale_in, bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); @@ -394,7 +394,7 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio for (int i = 0; i < size; i++) { vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = 
activation_ps(vfmacc_vv_f32m4(_v, scale_in, bias, vl), activation_type, activation_params, vl); + _v = activation_ps(vfmacc_vv_f32m4(bias, scale_in, _v, vl), activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); From 5dde273febd36c8894ecdefa0324051b8aab9f89 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sat, 17 Feb 2024 15:02:16 +0800 Subject: [PATCH 09/10] riscv convdw int8 packing rvv Signed-off-by: Molly Sophia --- .../riscv/convolutiondepthwise_riscv.cpp | 388 +++++++++++++++++- src/layer/riscv/convolutiondepthwise_riscv.h | 7 + 2 files changed, 383 insertions(+), 12 deletions(-) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index d913fe7e1d59..d35121dc3c66 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -60,8 +60,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { - // TODO implement int8 - return 0; + return create_pipeline_int8_riscv(opt); } #endif @@ -238,14 +237,11 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { + int packn = 1; +#if __riscv_vector + packn = csrr_vlenb(); +#endif Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; if (bottom_blob_unpacked.elembits() == 16) @@ -253,12 +249,15 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c Option opt_pack1 = opt; opt_pack1.blob_allocator = opt.workspace_allocator; + if (!opt.use_packing_layout || packn == 1 || bottom_blob.elempack == 1) + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + else + convert_packing(bottom_blob, bottom_blob_unpacked, packn / 4, opt_pack1); + cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); } - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return ConvolutionDepthWise::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8_riscv(bottom_blob_unpacked_fp32, top_blob, opt); } #endif @@ -1153,4 +1152,369 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int ConvolutionDepthWise_riscv::create_pipeline_int8_riscv(const Option& opt) +{ +#if __riscv_vector + const int packn = csrr_vlenb(); + size_t vl = vsetvl_e8m1(packn); +#endif + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + elempack = channels % packn == 0 ? 
packn : 1; + } + + if (elempack == packn) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, packn, opt); + } +#endif // __riscv_vector + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + scale_in_data.create(group); + for (int g = 0; g < group; g++) + { + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + scale_in_data[g] = scale_in; + } + + return 0; + } + + // group convolution + Option opt_unpack = opt; + opt_unpack.use_packing_layout = false; + create_group_ops(opt_unpack); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_riscv::forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_vector + const int packn = csrr_vlenb(); + size_t vl = vsetvl_e8m1(packn); +#endif + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + + int elembits = bottom_blob.elembits(); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + const int channels_g = channels * elempack / group; + + Mat scales(channels * elempack); + { + float* ps = scales; + for (int g = 0; g < group; g++) + { + float scale = bottom_blob_int8_scales[g]; + for (int q = 0; q < channels_g; q++) + { + *ps++ = scale; + } + } + } + + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + channels = bottom_blob_bordered.c; + elempack = bottom_blob_bordered.elempack; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + // depth-wise + if (channels * elempack == group && group == num_output) + { + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_vector + bool use_int8_requantize = int8_scale_term > 100; + size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_vector + if (elempack == packn) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * packn; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + vint32m4_t _sum = vmv_v_x_i32m4(0, vl); + + const signed char* sptr = m.row(i * stride_h) + j * stride_w * packn; + + for (int k = 0; k < maxk; k++) + { + vint16m2_t _val = vwcvt_x_x_v_i16m2(vle8_v_i8m1(sptr + space_ofs[k] * packn, vl), vl); + vint16m2_t _w = vwcvt_x_x_v_i16m2(vle8_v_i8m1(kptr + k * packn, vl), vl); + + _sum = vwmacc_vv_i32m4(_sum, _val, _w, vl); + } + + vfloat32m4_t _scale_in = vle32_v_f32m4((const float*)scale_in_data + g * packn, vl); + vfloat32m4_t _sumfp32 = bias_term ? vle32_v_f32m4((const float*)bias_data + g * packn, vl) + : vfmv_v_f_f32m4(0.f, vl); + + _sumfp32 = vfmacc_vv_f32m4(_sumfp32, _scale_in, vfcvt_f_x_v_f32m4(_sum, vl), vl); + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + if (use_int8_requantize) + { + // requantize and relu + vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)top_blob_int8_scales + g * packn, vl); + _sumfp32 = vfmul_vv_f32m4(_sumfp32, _scale_out, vl); + vint8m1_t _sum8 = float2int8(_sumfp32, vl); + + vse8_v_i8m1(outptr_s8, _sum8, vl); + outptr_s8 += packn; + } + else + { + // dequantize and relu + vse32_v_f32m4(outptr_f32, _sumfp32, vl); + outptr_f32 += packn; + } + } + } + } + } + } +#endif // __riscv_vector + + if (elempack == 1) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + float sumfp32 = sum * scale_in_data[g]; + + if (bias_term) + sumfp32 += bias_data[g]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float scale_out = top_blob_int8_scales[g]; + signed char sums8 = float2int8(sumfp32 * scale_out); + outptr_s8[0] = 
sums8; + outptr_s8 += 1; + } + else + { + // dequantize + outptr_f32[0] = sumfp32; + outptr_f32 += 1; + } + } + } + } + } + } + + return 0; + } + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if 0 + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % packn == 0 ? packn : 1; + else + out_elempack = num_output % (packn / 4) == 0 ? (packn / 4) : 1; + } +#endif // __riscv_vector + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if 0 + if (opt.use_packing_layout) + { + g_elempack = channels_g % packn == 0 ? packn : 1; + if (use_int8_requantize) + out_g_elempack = num_output_g % packn == 0 ? packn : 1; + else + out_g_elempack = num_output_g % (packn / 4) == 0 ? (packn / 4) : 1; + } +#endif // __riscv_vector + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} +#endif // NCNN_INT8 } // namespace ncnn diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h index f9503975296d..944a0ac58727 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.h +++ b/src/layer/riscv/convolutiondepthwise_riscv.h @@ -38,6 +38,10 @@ class ConvolutionDepthWise_riscv : public ConvolutionDepthWise int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8_riscv(const Option& opt); + int forward_int8_riscv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif public: Layer* activation; @@ -47,6 +51,9 @@ class ConvolutionDepthWise_riscv : public ConvolutionDepthWise // fp16 Mat bias_data_fp16; +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn From 830052acc726e0058f65d5dc1b39db5b52a7d11d Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 1 Mar 2024 15:44:12 +0800 Subject: [PATCH 10/10] fixup! 
riscv requantize packing --- src/layer/riscv/requantize_riscv.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp index f2d3db2d1ff7..833756e248a3 100644 --- a/src/layer/riscv/requantize_riscv.cpp +++ b/src/layer/riscv/requantize_riscv.cpp @@ -83,7 +83,10 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio signed char* ptr = (signed char*)top_blob + i * packn; vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + // _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + _v = vfmul_vf_f32m4(_v, scale_in, vl); + _v = vfadd_vv_f32m4(_v, _bias, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -98,7 +101,10 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + // _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + _v = vfmul_vf_f32m4(_v, scale_in, vl); + _v = vfadd_vv_f32m4(_v, _bias, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vf_f32m4(_v, scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -137,7 +143,10 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _scale_out = vle32_v_f32m4((const float*)scale_out_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + // _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + _v = vfmul_vf_f32m4(_v, scale_in, vl); + _v = vfadd_vv_f32m4(_v, _bias, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); } @@ -154,7 +163,10 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio vfloat32m4_t _bias = vle32_v_f32m4((const float*)bias_data + i * packn, vl); vfloat32m4_t _v = vfcvt_f_x_v_f32m4(vle32_v_i32m4(intptr, vl), vl); - _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + // _v = activation_ps(vfmacc_vf_f32m4(_bias, scale_in, _v, vl), activation_type, activation_params, vl); + _v = vfmul_vf_f32m4(_v, scale_in, vl); + _v = vfadd_vv_f32m4(_v, _bias, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); vint8m1_t _out = float2int8(vfmul_vv_f32m4(_v, _scale_out, vl), vl); vse8_v_i8m1(ptr, _out, vl); }