diff --git a/_modules/fbgemm_gpu/docs/examples.html b/_modules/fbgemm_gpu/docs/examples.html index 49e32b5ff..50160594a 100644 --- a/_modules/fbgemm_gpu/docs/examples.html +++ b/_modules/fbgemm_gpu/docs/examples.html @@ -292,6 +292,7 @@
  • Combine Input Operators
  • Layout Transformation Operators
  • Embedding Operators
  • +
  • Experimental Operators
  • FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    diff --git a/namespacefbgemm.html b/namespacefbgemm.html new file mode 100644 index 000000000..bb16814a5 --- /dev/null +++ b/namespacefbgemm.html @@ -0,0 +1,3276 @@ + + + + + + + +fbgemm_gpu: fbgemm Namespace Reference + + + + + + + + + + + +
    +
    + + + + + + +
    +
    fbgemm_gpu +
    +
    +
    + + + + + + + + +
    +
    + + +
    +
    +
    +
    +
    +
    Loading...
    +
    Searching...
    +
    No Matches
    +
    +
    +
    +
    + +
    +
    + +
    fbgemm Namespace Reference
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    +Classes

    struct  BCSRMatrix
     
    struct  block_type_t
     
    struct  BlockingFactors
     
    class  CodeCache
     
    class  CodeGenBase
     
    class  CompressedSparseColumn
     
    struct  conv_param_t
     
    class  DoNothing
     
    class  DoSConvOnInpBuffer
     
    class  DoSpmdmOnInpBuffer
     
    class  ExecuteKernel
     
    class  ExecuteKernel< packingAMatrix, PackBMatrix< int8_t, typename packingAMatrix::accType >, cT, processOutputType >
     
    struct  is_8bit
     
    class  memCopy
     
    class  PackAMatrix
     
    class  PackAWithIm2Col
     
    class  PackAWithQuantRowOffset
     
    class  PackAWithRowOffset
     
    class  PackBMatrix
     
    class  PackedGemmMatrixB
     
    struct  PackingTraits
     
    class  PackMatrix
     
    class  PackWeightMatrixForGConv
     
    class  PackWeightsForConv
     
    class  ReluOutput
     
    struct  requantizationForFloatParams_t
     
    struct  RequantizationParams
     
    struct  requantizationParams_t
     
    class  ReQuantizeForFloat
     
    class  ReQuantizeOutput
     
    class  ScaleOP
     
    struct  simd_info
     
    class  SparseAdaGradSignature
     
    struct  TensorQuantizationParams
     
    struct  thread_type_t
     
    + + + +

    +Enumerations

    enum class  impl_type_t
     
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    +Functions

    template<typename packingAMatrix , typename packingBMatrix , typename cT , typename processOutputType >
    void fbgemmPacked (PackMatrix< packingAMatrix, typename packingAMatrix::inpType, typename packingAMatrix::accType > &packA, PackMatrix< packingBMatrix, typename packingBMatrix::inpType, typename packingBMatrix::accType > &packB, cT *C, std::int32_t *C_buffer, std::uint32_t ldc, const processOutputType &outProcess, int thread_id, int num_threads, const BlockingFactors *blocking_params=nullptr)
     
    template<typename packed_W , typename outType , bool FUSE_RELU, QuantizationGranularity Q_GRAN, int SPATIAL_DIM = 2, typename BIAS_TYPE = std::int32_t>
    void fbgemmGroupwiseConv (const conv_param_t< SPATIAL_DIM > &conv_param, const std::uint8_t *activations, std::int32_t a_zero_point, std::int32_t *rowOffsetBuf, packed_W &packed_weights, outType *out, std::int32_t *outBuffer, const ReQuantizeOutput< FUSE_RELU, Q_GRAN, BIAS_TYPE > &outProcess, int thread_id, int num_threads)
     
    template<int SPATIAL_DIM = 2>
    int rowOffsetBufferSizeGConv (const conv_param_t< SPATIAL_DIM > &conv_param)
     
    template<typename processOutputType , int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
    int fbgemmConv (const conv_param_t< SPATIAL_DIM > &conv_p, const std::uint8_t *activations, PackWeightsForConv< SPATIAL_DIM, std::int8_t, ACC_T > &packed_weights, typename processOutputType::outType *out, std::int32_t *outBuffer, processOutputType &outProcess, int thread_id, int num_threads, const BlockingFactors *blocking_params=nullptr)
     
    template<int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
    optimized_conv_t ConvFastPath (const conv_param_t< SPATIAL_DIM > &conv_p)
     
    void FloatToBfloat16_ref (const float *src, bfloat16 *dst, size_t size)
     
    void Bfloat16ToFloat_ref (const bfloat16 *src, float *dst, size_t size)
     
    void FloatToBfloat16_simd (const float *src, bfloat16 *dst, size_t size)
     
    void Bfloat16ToFloat_simd (const bfloat16 *src, float *dst, size_t size)
     
    void FloatToFloat16_ref (const float *src, float16 *dst, size_t size, bool do_clip=false)
     
    void Float16ToFloat_ref (const float16 *src, float *dst, size_t size)
     
    void FloatToFloat16_simd (const float *src, float16 *dst, size_t size, bool do_clip=false)
     
    void Float16ToFloat_simd (const float16 *src, float *dst, size_t size)
     
    template<typename InType , typename IndexType , typename OffsetType = std::int32_t, typename OutType = float, bool THREAD_LOCAL = false>
    EmbeddingSpMDMKernelSignature< InType, IndexType, OffsetType, OutType >::Type GenerateEmbeddingSpMDM (const std::int64_t block_size, bool has_weight, bool normalize_by_lengths, int prefetch=16, bool is_weight_positional=false, bool use_offsets=true, bool is_bf16_out=false, bool is_bf16_in=false)
     
    template<typename InType , typename IndexType , typename OffsetType = std::int32_t, typename OutType = float, bool THREAD_LOCAL = false>
    EmbeddingSpMDMKernelSignature< InType, IndexType, OffsetType, OutType >::Type GenerateEmbeddingSpMDMWithStrides (const std::int64_t block_size, bool has_weight, bool normalize_by_lengths, int prefetch=16, bool is_weight_positional=false, bool use_offsets=true, std::int64_t output_stride=-1, std::int64_t input_stride=-1, bool scale_bias_last=true, bool no_bag=false, bool is_bf16_out=false, bool is_bf16_in=false)
     
    template<typename IndexType , typename OffsetType = std::int32_t, typename OutType = float>
    EmbeddingSpMDMKernelSignature< std::uint8_t, IndexType, OffsetType, OutType >::Type GenerateEmbeddingSpMDMNBit (int bit_rate, const std::int64_t block_size, bool has_weight, bool normalize_by_lengths, int prefetch=16, bool is_weight_positional=false, bool use_offsets=true)
     
    template<typename IndexType , typename OffsetType = std::int32_t, typename OutType = float, bool THREAD_LOCAL = false>
    EmbeddingSpMDMKernelSignature< std::uint8_t, IndexType, OffsetType, OutType >::Type GenerateEmbeddingSpMDMNBitWithStrides (int bit_rate, const std::int64_t block_size, bool has_weight, bool normalize_by_lengths, int prefetch=16, bool is_weight_positional=false, bool use_offsets=true, std::int64_t output_stride=-1, std::int64_t input_stride=-1, bool scale_bias_last=true, bool is_bf16_out=false)
     
    template<typename IndexType , typename OffsetType = std::int32_t, typename OutType = float>
    EmbeddingSpMDMKernelSignature< std::uint8_t, IndexType, OffsetType, OutType >::Type GenerateEmbeddingSpMDMFP8WithStrides (const std::int64_t block_size, bool normalize_by_lengths, bool is_weight_positional=false, bool use_offsets=true, std::int64_t output_stride=-1, std::int64_t input_stride=-1, int exponent_bits=4, int exponent_bias=7, bool is_bf16_out=false)
     
    template<typename InType , typename IndexType , typename OffsetType = std::int32_t>
    EmbeddingSpMDMRowWiseSparseKernelSignature< InType, IndexType, OffsetType >::Type GenerateEmbeddingSpMDMRowWiseSparse (const std::int64_t block_size, bool has_weight, bool normalize_by_lengths, int prefetch=16, bool is_weight_positional=false, bool use_offsets=true)
     
    template<typename IndexType , typename OffsetType = std::int32_t>
    EmbeddingSpMDMRowWiseSparseKernelSignature< std::uint8_t, IndexType, OffsetType >::Type GenerateEmbeddingSpMDMNBitRowWiseSparse (int bit_rate, const std::int64_t block_size, bool has_weight, bool normalize_by_lengths, int prefetch=16, bool is_weight_positional=false, bool use_offsets=true)
     
    template<typename IndexType , typename OffsetType = std::int32_t, typename DataType = float>
    RowWiseSparseAdaGradFusedSignature< IndexType, OffsetType, DataType >::Type GenerateRowWiseSparseAdaGradFused (int block_size, int prefetch=16, bool use_offsets=true, bool use_stochastic_rounding=true, int grad_stride=-1)
     
    void PackA (int nrow, int ncol, const float *from, int ldim, float *to)
     
    template<QuantizationGranularity Q_GRAN, typename BIAS_TYPE = std::int32_t>
    void depthwise_2d_same_pad (int N, int H, int W, int IC, int OC, int stride_h, int stride_w, std::int32_t A_zero_point, const std::uint8_t *A, const std::int32_t *B_zero_point, const PackedDepthWiseConvMatrix &Bp, const float *C_multiplier, std::int32_t C_zero_point, std::uint8_t *C, const std::int32_t *col_offsets, const BIAS_TYPE *bias, bool fuse_relu=false, const float *act_times_w_scale=nullptr, int thread_id=0, int num_threads=1)
     
    template<QuantizationGranularity Q_GRAN, typename BIAS_TYPE = std::int32_t>
    void depthwise_3d_same_pad (const conv_param_t< 3 > &conv_p, std::int32_t A_zero_point, const std::uint8_t *A, const std::int32_t *B_zero_point, const PackedDepthWiseConvMatrix &Bp, const float *C_multiplier, std::int32_t C_zero_point, std::uint8_t *C, const std::int32_t *col_offsets, const BIAS_TYPE *bias, bool fuse_relu=false, const float *act_times_w_scale=nullptr, int thread_id=0, int num_threads=1)
     
    void SparseDenseMM (int M, int N, const int *row_ptr, const int *col_idx, const float *values, const float *B, int ldb, float *C, int ldc, bool accum=false)
     
    template<typename T , bool LEGACY = true>
    Quantize (float src, std::int32_t zero_point, float scale, int result_precision, bool result_is_signed=std::is_signed< T >::value)
     
    template<typename T , layout_t LAYOUT = layout_t::KCX>
    void QuantizeGroupwise (const float *src, int K, int C, int X, int G, const float *scales, const std::int32_t *zero_points, T *dst)
     
    template<typename T >
    void FusedQuantizeDequantize (const float *src, float *dst, std::int64_t len, const TensorQuantizationParams &qparams, int thread_id=0, int num_threads=1, float noise_ratio=0.0f)
     
    template<typename InputType >
    void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf (int bit_rate, const InputType *input, size_t input_rows, int input_columns, std::uint8_t *output)
     
    template<typename OutputType >
    void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf (int bit_rate, const uint8_t *input, size_t input_rows, int input_columns, OutputType *output)
     
    template<typename InputType >
    void FloatOrHalfToFused8BitRowwiseQuantizedSBFloat (const InputType *input, size_t input_rows, int input_columns, std::uint8_t *output)
     
    template<typename OutputType >
    void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf (const uint8_t *input, size_t input_rows, int input_columns, OutputType *output)
     
    template<typename InputType >
    void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef (int bit_rate, const InputType *input, size_t input_rows, int input_columns, std::uint8_t *output)
     
    template<typename InputType >
    void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef (const InputType *input, size_t input_rows, int input_columns, std::uint8_t *output)
     
    template<typename OutputType >
    void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef (int bit_rate, const uint8_t *input, size_t input_rows, int input_columns, OutputType *output)
     
    template<typename OutputType >
    void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef (const uint8_t *input, size_t input_rows, int input_columns, OutputType *output)
     
    uint32_t Xor128 (void)
     
    template<bool A_SYMMETRIC, bool B_SYMMETRIC, QuantizationGranularity Q_GRAN, bool HAS_BIAS, bool FUSE_RELU, typename BIAS_TYPE = std::int32_t, bool DIRECT = false>
    void requantizeOutputProcessingAvx2 (std::uint8_t *out, const std::int32_t *inp, const block_type_t &block, int ld_out, int ld_in, const requantizationParams_t< BIAS_TYPE > &r)
     
    template<bool A_SYMMETRIC, bool B_SYMMETRIC, QuantizationGranularity Q_GRAN, bool HAS_BIAS, bool FUSE_RELU, int C_PER_G, typename BIAS_TYPE = std::int32_t>
    void requantizeOutputProcessingGConvAvx512 (std::uint8_t *out, const std::int32_t *inp, const block_type_t &block, int ld_out, int ld_in, const requantizationParams_t< BIAS_TYPE > &r)
     
    template<typename T >
    int compare_buffers (const T *ref, const T *test, int m, int n, int ld, size_t max_mismatches_to_report, float atol=1e-3)
     
    template<typename T >
    void printMatrix (matrix_op_t trans, const T *inp, size_t R, size_t C, size_t ld, std::string name)
     
    template<typename T >
    void transpose_simd (int64_t M, int64_t N, const T *src, int64_t ld_src, T *dst, int64_t ld_dst)
     
    void fbgemmForceIsa (inst_set_t)
     
    void fbgemmEnableAvx512Ymm (bool)
     
    inst_set_t fbgemmInstructionSet ()
     
    int fbgemmGet2DPartition (int m, int n, int nthreads, int n_align, double aspect_ratio)
     
    void fbgemmPartition1D (int thread_id, int num_threads, std::int64_t total_work, std::int64_t &start, std::int64_t &end)
     
    void fbgemmPartition1DBlocked (int thread_id, int num_threads, std::int64_t total_work, int block_size, std::int64_t &start, std::int64_t &end)
     
    bool is_autovec_disabled ()
     
    template<inst_set_t instSet, typename T , typename std::enable_if< instSet==inst_set_t::avx2, int >::type = 0>
    void gen16BitVectorOne (x86::Emitter *a, T dest)
     
    template<inst_set_t instSet, typename T , typename std::enable_if< instSet==inst_set_t::avx2, int >::type = 0>
    void emitLoadDWord (x86::Emitter *a, T dest, const x86::Mem &ptr)
     
    template<inst_set_t instSet, typename T , typename std::enable_if< instSet==inst_set_t::avx512||instSet==inst_set_t::avx512_ymm||instSet==inst_set_t::avx512_vnni||instSet==inst_set_t::avx512_vnni_ymm, int >::type = 0>
    void emitExtractHalfVector (x86::Emitter *a, x86::Ymm half, const x86::Zmm vec, int idx)
     
    template<typename T , typename std::enable_if< std::is_same< T, x86::Ymm >::value, int >::type = 0>
    void gen8BitVectorOne (x86::Emitter *a, T dest)
     
    template<inst_set_t INST_SET, typename std::enable_if< INST_SET==inst_set_t::avx2||INST_SET==inst_set_t::avx512, int >::type = 0>
    void genU8I8S32FMA (x86::Emitter *a, typename simd_info< INST_SET >::vec_reg_t aReg, typename simd_info< INST_SET >::vec_reg_t bReg, typename simd_info< INST_SET >::vec_reg_t cReg, typename simd_info< INST_SET >::vec_reg_t oneReg16Bit, typename simd_info< INST_SET >::vec_reg_t tmpReg)
     
    template<inst_set_t INST_SET, typename std::enable_if< INST_SET==inst_set_t::avx2||INST_SET==inst_set_t::avx512, int >::type = 0>
    void genU8Sum4 (x86::Emitter *a, typename simd_info< INST_SET >::vec_reg_t src, typename simd_info< INST_SET >::vec_reg_t dest, typename simd_info< INST_SET >::vec_reg_t oneReg16Bit, typename simd_info< INST_SET >::vec_reg_t tmpReg)
     
    template<typename T >
    void genU8Sum8 (x86::Emitter *a, T src, T dest, T tmpReg)
     
    void initCRegs (x86::Emitter *a, int rowRegs, int colRegs)
     
    void requantize_u8acc32_ref (int M, int N, int ld, const std::int32_t *inp, std::uint8_t *out, std::int32_t C_multiplier, std::int32_t C_right_shift, std::int32_t C_zero_point, std::int32_t A_zero_point, std::int32_t B_zero_point, const std::int32_t *row_offsets, const std::int32_t *col_offsets, const std::int32_t *bias, bool fuse_relu=false)
     
    void requantize_u8acc32_ref (int M, int N, int ld, const std::int32_t *inp, std::uint8_t *out, const float *C_multiplier, std::int32_t C_zero_point, std::int32_t A_zero_point, const std::int32_t *B_zero_point, const std::int32_t *row_offsets, const std::int32_t *col_offsets, const std::int32_t *bias, int ncols_per_quant_group, bool fuse_relu=false)
     
    void col_offsets_with_zero_pt_s8acc32_ref (int K, int N, int ld, const std::int8_t *Bint8, const std::int32_t *B_zero_point, std::int32_t *col_offsets, int ncols_per_quant_group)
     
    void spmdm_ref (int M, const std::uint8_t *A, int lda, CompressedSparseColumn &B, bool accumulation, std::int32_t *C, int ldc, int groups=1)
     
    template<typename IndexType >
    int sparse_adagrad_ref (int num_rows, int block_size, std::uint64_t param_size, float *w, const float *g, float *h, const IndexType *indices, float epsilon, float lr, float weight_decay=0.f, const double *counter=nullptr, const int64_t counter_halflife=0)
     
    template<typename IndexType >
    int rowwise_sparse_adagrad_ref (int num_rows, int block_size, std::uint64_t param_size, float *w, const float *g, float *h, const IndexType *indices, float epsilon, float lr, float weight_decay=0.f, const double *counter=nullptr, const int64_t counter_halflife=0)
     
    template<typename T >
    void transpose_ref (int64_t M, int64_t N, const T *src, int64_t ld_src, T *dst, int64_t ld_dst)
     
    +

    Detailed Description

    +

    Top level include file for FBGEMM.

    +

    Enumeration Type Documentation

    + +

    ◆ impl_type_t

    + +
    +
    + + + + + +
    + + + + +
    enum class impl_type_t
    +
    +strong
    +
    + +

    Typed enum for implementation type.

    +

    ref is reference and opt is optimized.

    + +
    +
    +

    Function Documentation

    + +

    ◆ Bfloat16ToFloat_ref()

    + +
    +
    + + + + + + + + + + + + + + + + +
    void Bfloat16ToFloat_ref (const bfloat16 * src,
    float * dst,
    size_t size )
    +
    +

    Transform all entries in a matrix from bfloat16 to fp32: reference implementation.

    + +
    +
    + +

    ◆ Bfloat16ToFloat_simd()

    + +
    +
    + + + + + + + + + + + + + + + + +
    void Bfloat16ToFloat_simd (const bfloat16 * src,
    float * dst,
    size_t size )
    +
    +

    Transform all entries in a matrix from bfloat16 to fp32: simd implementation.

    + +
    +
    + +

    ◆ col_offsets_with_zero_pt_s8acc32_ref()

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void col_offsets_with_zero_pt_s8acc32_ref (int K,
    int N,
    int ld,
    const std::int8_t * Bint8,
    const std::int32_t * B_zero_point,
    std::int32_t * col_offsets,
    int ncols_per_quant_group )
    +
    + +

    Reference implementation to compute adjusted col_offsets (sum of columns of B and adjusted with B_zero_point)

    +
    Parameters
    + + +
    ncols_per_quant_groupsee ncols_per_quant_group in requantize_u8acc32_ref
    +
    +
    + +
    +
    + +

    ◆ compare_buffers()

    + +
    +
    +
    +template<typename T >
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    int compare_buffers (const T * ref,
    const T * test,
    int m,
    int n,
    int ld,
    size_t max_mismatches_to_report,
    float atol )
    +
    + +

    A function to compare data in two buffers for closeness/equality.

    +

    Compare the reference and test result matrix to check the correctness.

    +
    Parameters
    + + + + + + + + +
    refThe buffer for the reference result matrix.
    testThe buffer for the test result matrix.
    mThe height of the reference and test result matrix.
    nThe width of the reference and test result matrix.
    ldThe leading dimension of the reference and test result matrix.
    max_mismatches_to_reportThe maximum number of tolerable mismatches to report.
    atolThe tolerable error.
    +
    +
    +
    Return values
    + + + +
    falseIf the number of mismatches for reference and test result matrix exceeds max_mismatches_to_report.
    trueIf the number of mismatches for reference and test result matrix is tolerable.
    +
    +
    + +
    +
    + +

    ◆ ConvFastPath()

    + +
    +
    +
    +template<int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
    + + + + + + + +
    optimized_conv_t ConvFastPath (const conv_param_t< SPATIAL_DIM > & conv_p)
    +
    + +

    Returns which fast path to take.

    +
    Template Parameters
    + + +
    SPATIAL_DIMIt's 2 for 2D convolutions and 3 for 3D convolutions.
    +
    +
    +
    Returns
    optimized_conv_t::depthwise, optimized_conv_t::groupwise or optimized_conv_t::im2col
    + +
    +
    + +

    ◆ depthwise_2d_same_pad()

    + +
    +
    +
    +template<QuantizationGranularity Q_GRAN, typename BIAS_TYPE = std::int32_t>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void depthwise_2d_same_pad (int N,
    int H,
    int W,
    int IC,
    int OC,
    int stride_h,
    int stride_w,
    std::int32_t A_zero_point,
    const std::uint8_t * A,
    const std::int32_t * B_zero_point,
    const PackedDepthWiseConvMatrix & Bp,
    const float * C_multiplier,
    std::int32_t C_zero_point,
    std::uint8_t * C,
    const std::int32_t * col_offsets,
    const BIAS_TYPE * bias,
    bool fuse_relu = false,
    const float * act_times_w_scale = nullptr,
    int thread_id = 0,
    int num_threads = 1 )
    +
    +

    Depth-wise convolution that results in the same output feature size as the input feature. That is PAD_T = PAD_B = (R - 1) / 2 and PAD_L = PAD_R = (S - 1) / 2. This function also does requantization.

    Parameters
    + + + +
    col_offsetsnullptr if col_offsets are folded into bias
    act_times_w_scaleOnly used if BIAS_TYPE is float, i.e., bias is unquantized.
    +
    +
    + +
    +
    + +

    ◆ depthwise_3d_same_pad()

    + +
    +
    +
    +template<QuantizationGranularity Q_GRAN, typename BIAS_TYPE = std::int32_t>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void depthwise_3d_same_pad (const conv_param_t< 3 > & conv_p,
    std::int32_t A_zero_point,
    const std::uint8_t * A,
    const std::int32_t * B_zero_point,
    const PackedDepthWiseConvMatrix & Bp,
    const float * C_multiplier,
    std::int32_t C_zero_point,
    std::uint8_t * C,
    const std::int32_t * col_offsets,
    const BIAS_TYPE * bias,
    bool fuse_relu = false,
    const float * act_times_w_scale = nullptr,
    int thread_id = 0,
    int num_threads = 1 )
    +
    +
    Parameters
    + + +
    col_offsetsnullptr if col_offsets are folded into bias
    +
    +
    + +
    +
    + +

    ◆ emitExtractHalfVector()

    + +
    +
    +
    +template<inst_set_t instSet, typename T , typename std::enable_if< instSet==inst_set_t::avx512||instSet==inst_set_t::avx512_ymm||instSet==inst_set_t::avx512_vnni||instSet==inst_set_t::avx512_vnni_ymm, int >::type = 0>
    + + + + + + + + + + + + + + + + + + + + + +
    void emitExtractHalfVector (x86::Emitter * a,
    x86::Ymm half,
    const x86::Zmm vec,
    int idx )
    +
    + +

    Emit partial extract from Wide register to Half Register, e.g., Zmm -> Ymm or Ymm -> Xmm.

    +
    Template Parameters
    + + +
    instSetinstruction set to be used
    +
    +
    +
    Parameters
    + + + + +
    halfDestination (half) vector register
    vecSource (full) vector register
    idxIndex of the half vector 0 or 1
    +
    +
    + +
    +
    + +

    ◆ emitLoadDWord()

    + +
    +
    +
    +template<inst_set_t instSet, typename T , typename std::enable_if< instSet==inst_set_t::avx2, int >::type = 0>
    + + + + + + + + + + + + + + + + +
    void emitLoadDWord (x86::Emitter * a,
    T dest,
    const x86::Mem & ptr )
    +
    + +

    Emit instruction to load 32-bit integer. AVX512 has a different instruction to load registers with index >= 16.

    +
    Template Parameters
    + + +
    TRegister type of destination, e.g., x86::Ymm or x86::Zmm
    +
    +
    +
    Parameters
    + + +
    destDestination vector register
    +
    +
    + +
    +
    + +

    ◆ fbgemmConv()

    + +
    +
    +
    +template<typename processOutputType , int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    int fbgemmConv (const conv_param_t< SPATIAL_DIM > & conv_p,
    const std::uint8_t * activations,
    PackWeightsForConv< SPATIAL_DIM, std::int8_t, ACC_T > & packed_weights,
    typename processOutputType::outType * out,
    std::int32_t * outBuffer,
    processOutputType & outProcess,
    int thread_id,
    int num_threads,
    const BlockingFactors * blocking_params = nullptr )
    +
    + +

    Performs convolution using fastest path available.

    +
    Template Parameters
    + + +
    SPATIAL_DIMIt's 2 for 2D convolutions and 3 for 3D convolutions.
    +
    +
    + +
    +
    + +

    ◆ fbgemmEnableAvx512Ymm()

    + +
    +
    + + + + + + + +
    void fbgemmEnableAvx512Ymm (bool flag)
    +
    + +

    Enable AVX512-256 path for Intel(r) Xeon(r) D servers.

    +

    Enables AVX512-256 if appropriate. Intended for Skylake-based Xeon-D processors, wherein AVX512-256 is preferred due to higher Turbo frequencies.

    +
    Parameters
    + + +
    flagTrue enables / False disables
    +
    +
    + +
    +
    + +

    ◆ fbgemmForceIsa()

    + +
    +
    + + + + + + + +
    void fbgemmForceIsa (inst_set_t isa)
    +
    + +

    Explicitly set instruction set to be used.

    +

    Force a specific architecture for GEMM kernel execution; overrides the FBGEMM_ENABLE_AVX512_256 env. variable.

    +
    Parameters
    + + +
    isathe ISA to enforce, supported options: AVX2 inst_set_t::avx2 AVX512 inst_set_t::avx512 AVX512_E1 inst_set_t::avx512_vnni AVX512_256 inst_set_t::avx512_ymm AVX512_E1_256 inst_set_t::avx512_vnni_ymm
    +
    +
    + +
    +
    + +

    ◆ fbgemmGet2DPartition()

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    int fbgemmGet2DPartition (int m,
    int n,
    int nthreads,
    int n_align,
    double aspect_ratio )
    +
    + +

    A heuristic algorithm to partition the threads across m and n dimensions for parallelization, ensuring the ratio between the number of rows allocated to each thread in the m dimension and the number of columns allocated to each thread in the n dimension is approximately aspect_ratio.

    +

    The less aspect_ratio is, the more favorable it is to parallelize the m dimension over the n dimension.

    + +
    +
    + +

    ◆ fbgemmGroupwiseConv()

    + +
    +
    +
    +template<typename packed_W , typename outType , bool FUSE_RELU, QuantizationGranularity Q_GRAN, int SPATIAL_DIM = 2, typename BIAS_TYPE = std::int32_t>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void fbgemmGroupwiseConv (const conv_param_t< SPATIAL_DIM > & conv_param,
    const std::uint8_t * activations,
    std::int32_t a_zero_point,
    std::int32_t * rowOffsetBuf,
    packed_W & packed_weights,
    outType * out,
    std::int32_t * outBuffer,
    const ReQuantizeOutput< FUSE_RELU, Q_GRAN, BIAS_TYPE > & outProcess,
    int thread_id,
    int num_threads )
    +
    + +

    Perform small-channels-per-group groupwise convolution. Note: Currently threading is not supported. This function does nothing for thread_ids > 0, i.e., returns early.

    +
    Parameters
    + + +
    rowOffsetBufnullptr if B uses symmetric quantization. Note: Currently threading is not supported. This function does nothing for thread_ids > 0, i.e., returns early.
    +
    +
    + +
    +
    + +

    ◆ fbgemmInstructionSet()

    + +
    +
    + + + + + + + +
    inst_set_t fbgemmInstructionSet ()
    +
    + +

    Retrieve current CPU instruction set.

    +

    Determine the best available x86 machine ISA to be used for GEMM kernels. Setting the FBGEMM_ENABLE_AVX512_256 env. variable or calling fbgemmForceIsa() forces a specific architecture if supported by the processor. Enforcing AVX2 on Skylake will execute the AVX2 version of the kernel. However, enforcing AVX512-256 on Broadwell will fail, and the AVX2 version of the kernels will be executed.

    + +
    +
    + +

    ◆ fbgemmPacked()

    + +
    +
    +
    +template<typename packingAMatrix , typename packingBMatrix , typename cT , typename processOutputType >
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void fbgemmPacked (PackMatrix< packingAMatrix, typename packingAMatrix::inpType, typename packingAMatrix::accType > & packA,
    PackMatrix< packingBMatrix, typename packingBMatrix::inpType, typename packingBMatrix::accType > & packB,
    cT * C,
    std::int32_t * C_buffer,
    std::uint32_t ldc,
    const processOutputType & outProcess,
    int thread_id,
    int num_threads,
    const BlockingFactors * blocking_params = nullptr )
    +
    +

    Matrix B must be prepacked. For matrix A, packA.pack function is called to pack it.

    +
    Template Parameters
    + + + + + +
    packingAMatrixprocessing of A matrix while packing, e.g., PackAWithQuantRowOffset
    packingBMatrixprocessing of B matrix while packing, e.g., pre-multiply by alpha
    cTdata type of C matrix
    processOutputTypefurther processing of outputs, e.g., Relu
    +
    +
    + +
    +
    + +

    ◆ fbgemmPartition1D()

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void fbgemmPartition1D (int thread_id,
    int num_threads,
    std::int64_t total_work,
    std::int64_t & start,
    std::int64_t & end )
    +
    + +

    Partition work across given number of threads.

    +
    Parameters
    + + + +
    startGiven thread_id should execute starting from the index start
    endGiven thread_id should stop executing at the index end
    +
    +
    +

    i.e., the loop should be equivalent to for(int i = start; i < end; ++i)

    + +
    +
    + +

    ◆ fbgemmPartition1DBlocked()

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void fbgemmPartition1DBlocked (int thread_id,
    int num_threads,
    std::int64_t total_work,
    int block_size,
    std::int64_t & start,
    std::int64_t & end )
    +
    + +

    Partition work across given number of threads in blocks of size block_size. Each thread gets a multiple of block_size work or nothing, except the last one. The last one might receive the fringe case.

    +
    Parameters
    + + + +
    startGiven thread_id should execute starting from the index start
    endGiven thread_id should stop executing at the index end
    +
    +
    +

    The loop can be equivalent to for(int i = start; i < end; i+=block_size) except for the last thread. (i.e., thread_id = num_threads - 1)

    +

    Example 1: block_size = 2, num_threads = 2 total_work start(th 0) end(th 0) start(th 1) end(th 1) 4 0 2 2 4 5 0 2 2 5

    +

    Example 2: block_size = 2, num_threads = 3 total_work start(th 0) end(th 0) start(th 1) end(th 1) 4 0 2 2 4 5 0 2 2 4

    +

    total_work start(th 2) end(th 2) 4 4 4 5 4 5

    +

    Example 3: block_size = 2, num_threads = 4 total_work start(th 0) end(th 0) start(th 1) end(th 1) 4 0 2 2 4 5 0 2 2 4

    +

    total_work start(th 2) end(th 2) start(th 3) end(th 3) 4 4 4 4 4 5 4 4 4 5

    + +
    +
    + +

    ◆ Float16ToFloat_ref()

    + +
    +
    + + + + + + + + + + + + + + + + +
    void Float16ToFloat_ref (const float16 * src,
    float * dst,
    size_t size )
    +
    +

    Transform all entries in a matrix from float16 to fp32: reference implementation.

    + +
    +
    + +

    ◆ Float16ToFloat_simd()

    + +
    +
    + + + + + + + + + + + + + + + + +
    void Float16ToFloat_simd (const float16 * src,
    float * dst,
    size_t size )
    +
    +

    Transform all entries in a matrix from float16 to fp32: simd implementation.

    + +
    +
    + +

    ◆ FloatOrHalfToFused8BitRowwiseQuantizedSBFloat()

    + +
    +
    +
    +template<typename InputType >
    + + + + + + + + + + + + + + + + + + + + + +
    void FloatOrHalfToFused8BitRowwiseQuantizedSBFloat (const InputType * input,
    size_t input_rows,
    int input_columns,
    std::uint8_t * output )
    +
    +

    Convert float or half inputs to rowwise quantized (8-bit) outputs. Scale and Bias are in float. Each row's Scale and Bias are stored in the row itself (fused) at the end.

    +

    This version intentionally supports only 8-bit because we want to discourage the usage of float scale and bias with 2 and 4 bit cases as that diminishes the overall memory savings.

    + +
    +
    + +

    ◆ FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef()

    + +
    +
    +
    +template<typename InputType >
    + + + + + + + + + + + + + + + + + + + + + +
    void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatRef (const InputType * input,
    size_t input_rows,
    int input_columns,
    std::uint8_t * output )
    +
    +

    Same as FloatOrHalfToFused8BitRowwiseQuantizedSBFloat but unoptimized. This should not be called directly except in testing.

    + +
    +
    + +

    ◆ FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef()

    + +
    +
    +
    +template<typename InputType >
    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfRef (int bit_rate,
    const InputType * input,
    size_t input_rows,
    int input_columns,
    std::uint8_t * output )
    +
    +

    Same as FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf but unoptimized. This should not be called directly except in testing.

    + +
    +
    + +

    ◆ FloatToBfloat16_ref()

    + +
    +
    + + + + + + + + + + + + + + + + +
    void FloatToBfloat16_ref (const float * src,
    bfloat16 * dst,
    size_t size )
    +
    +

    Transform all entries in a matrix from fp32 to bfloat16: reference implementation.

    + +
    +
    + +

    ◆ FloatToBfloat16_simd()

    + +
    +
    + + + + + + + + + + + + + + + + +
    void FloatToBfloat16_simd (const float * src,
    bfloat16 * dst,
    size_t size )
    +
    +

    Transform all entries in a matrix from fp32 to bfloat16: simd implementation.

    + +
    +
    + +

    ◆ FloatToFloat16_ref()

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + +
    void FloatToFloat16_ref (const float * src,
    float16 * dst,
    size_t size,
    bool do_clip = false )
    +
    +

    Transform all entries in a matrix from fp32 to float16: reference implementation.

    +
    Parameters
    + + +
    do_clipif true we saturate to fp16 min and max instead of generating infinities.
    +
    +
    + +
    +
    + +

    ◆ FloatToFloat16_simd()

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + +
    void FloatToFloat16_simd (const float * src,
    float16 * dst,
    size_t size,
    bool do_clip = false )
    +
    +

    Transform all entries in a matrix from fp32 to float16: simd implementation.

    +
    Parameters
    + + +
    do_clipif true we saturate to fp16 min and max instead of generating infinities.
    +
    +
    + +
    +
    + +

    ◆ Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf()

    + +
    +
    +
    +template<typename OutputType >
    + + + + + + + + + + + + + + + + + + + + + +
    void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf (const uint8_t * input,
    size_t input_rows,
    int input_columns,
    OutputType * output )
    +
    +

    Convert fused rowwise quantized (8-bit) inputs to float or half outputs. Scale and Bias are in float. Each row's Scale and Bias are stored in the row itself (fused) at the end.

    +

    This version intentionally supports only 8-bit because the corresponding quantize version only supports 8-bit.

    + +
    +
    + +

    ◆ Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef()

    + +
    +
    +
    +template<typename OutputType >
    + + + + + + + + + + + + + + + + + + + + + +
    void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfRef (const uint8_t * input,
    size_t input_rows,
    int input_columns,
    OutputType * output )
    +
    +

    Same as Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf but unoptimized. This should not be called directly except in testing.

    + +
    +
    + +

    ◆ FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf()

    + +
    +
    +
    +template<typename OutputType >
    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf (int bit_rate,
    const uint8_t * input,
    size_t input_rows,
    int input_columns,
    OutputType * output )
    +
    +

    Convert fused rowwise quantized inputs to float (fp32 or fp16). bit_rate specifies the number of bits in quantized input. Scale and Bias are in fp16. Each row's Scale and Bias are stored in the row itself (fused) at the end.

    +
    Parameters
    + + +
    bit_ratecan be 2, 4, or 8
    +
    +
    + +
    +
    + +

    ◆ FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef()

    + +
    +
    +
    +template<typename OutputType >
    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfRef (int bit_rate,
    const uint8_t * input,
    size_t input_rows,
    int input_columns,
    OutputType * output )
    +
    +

    Same as FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf but unoptimized. This should not be called directly except in testing.

    + +
    +
    + +

    ◆ gen16BitVectorOne()

    + +
    +
    +
    +template<inst_set_t instSet, typename T , typename std::enable_if< instSet==inst_set_t::avx2, int >::type = 0>
    + + + + + + + + + + + +
    void gen16BitVectorOne (x86::Emitter * a,
    T dest )
    +
    + +

    Create instruction sequence to generate 16-bit 1s.

    +
    Template Parameters
    + + +
    TRegister type of destination, e.g., x86::Ymm or x86::Zmm
    +
    +
    +
    Parameters
    + + +
    destOnce the instruction sequence is executed, dest[0:15] will have 0x0001, dest[16:31] will have 0x0001 and so on
    +
    +
    + +
    +
    + +

    ◆ gen8BitVectorOne()

    + +
    +
    +
    +template<typename T , typename std::enable_if< std::is_same< T, x86::Ymm >::value, int >::type = 0>
    + + + + + + + + + + + +
    void gen8BitVectorOne (x86::Emitter * a,
    T dest )
    +
    + +

    Create instruction sequence to generate 8-bit 1s.

    +
    Template Parameters
    + + +
    TRegister type of destination, e.g., x86::Ymm or x86::Zmm
    +
    +
    +
    Parameters
    + + +
    destOnce the instruction sequence is executed, dest[0:7] will have 0x01, dest[8:15] will have 0x01 and so on
    +
    +
    + +
    +
    + +

    ◆ GenerateEmbeddingSpMDM()

    + +
    +
    +
    +template<typename InType , typename IndexType , typename OffsetType = std::int32_t, typename OutType = float, bool THREAD_LOCAL = false>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    EmbeddingSpMDMKernelSignature< InType, IndexType, OffsetType, OutType >::Type GenerateEmbeddingSpMDM (const std::int64_t block_size,
    bool has_weight,
    bool normalize_by_lengths,
    int prefetch = 16,
    bool is_weight_positional = false,
    bool use_offsets = true,
    bool is_bf16_out = false,
    bool is_bf16_in = false )
    +
    +
    Template Parameters
    + + + + +
    InTypecan be float, float16, or uint8_t
    IndexTypecan be int32_t or int64_t
    OffsetTypecan be int32_t or int64_t
    +
    +
    +
    Parameters
    + + +
    use_offsetsIf true, the generated code assumes we will pass offsets instead of lengths, which conforms to the PyTorch EmbeddingBag interface. In this case, the length of offsets array should be output_size + 1 and offsets[output_size] should be index_size. If false, the generated code assumes we will pass lengths, which conforms to the Caffe2 SparseLengthsSum interface.
    +
    +
    + +
    +
    + +

    ◆ GenerateEmbeddingSpMDMFP8WithStrides()

    + +
    +
    +
    +template<typename IndexType , typename OffsetType = std::int32_t, typename OutType = float>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    EmbeddingSpMDMKernelSignature< std::uint8_t, IndexType, OffsetType, OutType >::Type GenerateEmbeddingSpMDMFP8WithStrides (const std::int64_t block_size,
    bool normalize_by_lengths,
    bool is_weight_positional = false,
    bool use_offsets = true,
    std::int64_t output_stride = -1,
    std::int64_t input_stride = -1,
    int exponent_bits = 4,
    int exponent_bias = 7,
    bool is_bf16_out = false )
    +
    +
    Parameters
    + + + + + +
    output_strideIf -1, output_stride is same as block_size
    input_stridein Bytes. If -1, input_stride is same as block_size / num_elem_per_byte + 2 * sizeof(float16)
    exponent_bitsis the number of exponent bits in the FP8 encode (normally 4 or 5)
    exponent_biasis subtracted from the exponent to obtain the actual exponent for the floating-point number
    +
    +
    + +
    +
    + +

    ◆ GenerateEmbeddingSpMDMNBit()

    + +
    +
    +
    +template<typename IndexType , typename OffsetType = std::int32_t, typename OutType = float>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    EmbeddingSpMDMKernelSignature< std::uint8_t, IndexType, OffsetType, OutType >::Type GenerateEmbeddingSpMDMNBit (int bit_rate,
    const std::int64_t block_size,
    bool has_weight,
    bool normalize_by_lengths,
    int prefetch = 16,
    bool is_weight_positional = false,
    bool use_offsets = true )
    +
    +
    Template Parameters
    + + + +
    IndexTypecan be int32_t or int64_t
    OffsetTypecan be int32_t or int64_t
    +
    +
    +
    Parameters
    + + +
    bit_ratecan be 2 or 4
    +
    +
    + +
    +
    + +

    ◆ GenerateEmbeddingSpMDMNBitRowWiseSparse()

    + +
    +
    +
    +template<typename IndexType , typename OffsetType = std::int32_t>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    EmbeddingSpMDMRowWiseSparseKernelSignature< std::uint8_t, IndexType, OffsetType >::Type GenerateEmbeddingSpMDMNBitRowWiseSparse (int bit_rate,
    const std::int64_t block_size,
    bool has_weight,
    bool normalize_by_lengths,
    int prefetch = 16,
    bool is_weight_positional = false,
    bool use_offsets = true )
    +
    +
    Template Parameters
    + + + +
    IndexTypecan be int32_t or int64_t
    OffsetTypecan be int32_t or int64_t
    +
    +
    +
    Parameters
    + + +
    bit_ratecan be 2 or 4
    +
    +
    + +
    +
    + +

    ◆ GenerateEmbeddingSpMDMNBitWithStrides()

    + +
    +
    +
    +template<typename IndexType , typename OffsetType = std::int32_t, typename OutType = float, bool THREAD_LOCAL = false>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    EmbeddingSpMDMKernelSignature< std::uint8_t, IndexType, OffsetType, OutType >::Type GenerateEmbeddingSpMDMNBitWithStrides (int bit_rate,
    const std::int64_t block_size,
    bool has_weight,
    bool normalize_by_lengths,
    int prefetch = 16,
    bool is_weight_positional = false,
    bool use_offsets = true,
    std::int64_t output_stride = -1,
    std::int64_t input_stride = -1,
    bool scale_bias_last = true,
    bool is_bf16_out = false )
    +
    +
    Parameters
    + + + + +
    output_strideIf -1, output_stride is same as block_size
    input_stridein Bytes. If -1, input_stride is same as block_size / num_elem_per_byte + 2 * sizeof(float16)
    scale_bias_lastif false, scale and bias appear at the beginning of each row and are in fp16 for table batched embedding (TBE) in FBGEMM_GPU. If false, it can also take -1 indices (output from pruned embedding id mapping)
    +
    +
    + +
    +
    + +

    ◆ GenerateEmbeddingSpMDMRowWiseSparse()

    + +
    +
    +
    +template<typename InType , typename IndexType , typename OffsetType = std::int32_t>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    EmbeddingSpMDMRowWiseSparseKernelSignature< InType, IndexType, OffsetType >::Type GenerateEmbeddingSpMDMRowWiseSparse (const std::int64_t block_size,
    bool has_weight,
    bool normalize_by_lengths,
    int prefetch = 16,
    bool is_weight_positional = false,
    bool use_offsets = true )
    +
    +
    Template Parameters
    + + + + +
    InTypecan be float, float16, or uint8_t
    IndexTypecan be int32_t or int64_t
    OffsetTypecan be int32_t or int64_t
    +
    +
    + +
    +
    + +

    ◆ GenerateEmbeddingSpMDMWithStrides()

    + +
    +
    +
    +template<typename InType , typename IndexType , typename OffsetType = std::int32_t, typename OutType = float, bool THREAD_LOCAL = false>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    EmbeddingSpMDMKernelSignature< InType, IndexType, OffsetType, OutType >::Type GenerateEmbeddingSpMDMWithStrides (const std::int64_t block_size,
    bool has_weight,
    bool normalize_by_lengths,
    int prefetch = 16,
    bool is_weight_positional = false,
    bool use_offsets = true,
    std::int64_t output_stride = -1,
    std::int64_t input_stride = -1,
    bool scale_bias_last = true,
    bool no_bag = false,
    bool is_bf16_out = false,
    bool is_bf16_in = false )
    +
    +
    Parameters
    + + + + +
    output_strideIf -1, output_stride is same as block_size
    input_strideIf -1, input_stride is same as block_size
    scale_bias_lastif false, scale and bias appear at the beginning of each row and are in fp16 for table batched embedding (TBE) in FBGEMM_GPU. If false, it can also take -1 indices (output from pruned embedding id mapping)
    +
    +
    + +
    +
    + +

    ◆ GenerateRowWiseSparseAdaGradFused()

    + +
    +
    +
    +template<typename IndexType , typename OffsetType = std::int32_t, typename DataType = float>
    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    RowWiseSparseAdaGradFusedSignature< IndexType, OffsetType, DataType >::Type GenerateRowWiseSparseAdaGradFused (int block_size,
    int prefetch = 16,
    bool use_offsets = true,
    bool use_stochastic_rounding = true,
    int grad_stride = -1 )
    +
    +
    Parameters
    + + +
    grad_strideIf -1, grad_stride is same as block size
    +
    +
    + +
    +
    + +

    ◆ genU8I8S32FMA()

    + +
    +
    +
    +template<inst_set_t INST_SET, typename std::enable_if< INST_SET==inst_set_t::avx2||INST_SET==inst_set_t::avx512, int >::type = 0>
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void genU8I8S32FMA (x86::Emitter * a,
    typename simd_info< INST_SET >::vec_reg_t aReg,
    typename simd_info< INST_SET >::vec_reg_t bReg,
    typename simd_info< INST_SET >::vec_reg_t cReg,
    typename simd_info< INST_SET >::vec_reg_t oneReg16Bit,
    typename simd_info< INST_SET >::vec_reg_t tmpReg )
    +
    + +

    Generates instruction sequence to compute s32 += U8 * I8.

    +
    Template Parameters
    + + +
    TRegister type of destination, e.g., x86::Ymm or x86::Zmm
    +
    +
    +
    Parameters
    + + +
    cRegcontains result
    +
    +
    + +
    +
    + +

    ◆ genU8Sum4()

    + +
    +
    +
    +template<inst_set_t INST_SET, typename std::enable_if< INST_SET==inst_set_t::avx2||INST_SET==inst_set_t::avx512, int >::type = 0>
    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void genU8Sum4 (x86::Emitter * a,
    typename simd_info< INST_SET >::vec_reg_t src,
    typename simd_info< INST_SET >::vec_reg_t dest,
    typename simd_info< INST_SET >::vec_reg_t oneReg16Bit,
    typename simd_info< INST_SET >::vec_reg_t tmpReg )
    +
    + +

    Add 4 consecutive numbers of type uint8 and emit their sum as 32-bit numbers. i.e., dest[0:31] contains src[0:7] + src[8:15] + src[16:23] + src[24:31].

    +
    Template Parameters
    + + +
    TRegister type of destination, e.g., x86::Ymm or x86::Zmm
    +
    +
    +
    Parameters
    + + +
    destcontains result
    +
    +
    + +
    +
    + +

    ◆ genU8Sum8()

    + +
    +
    +
    +template<typename T >
    + + + + + + + + + + + + + + + + + + + + + +
    void genU8Sum8 (x86::Emitter * a,
    T src,
    T dest,
    T tmpReg )
    +
    + +

    Add 8 consecutive numbers of type uint8 and emit their sum as 16-bit numbers. i.e., dest[0:15] contains src[0:7] + src[8:15] + src[16:23] + src[24:31] + src[32:39] + src[40:47] + src[48:55] + src[56:63].

    +

    and

    +

    dest[64:79] contains src[64:71] + src[72:79] + src[80:87] + src[88:95] + src[96:103] + src[104:111] + src[112:119] + src[120:127]

    +

    so on

    +
    Template Parameters
    + + +
    TRegister type of destination, e.g., x86::Ymm or x86::Zmm
    +
    +
    +
    Parameters
    + + +
    destcontains result
    +
    +
    + +
    +
    + +

    ◆ initCRegs()

    + +
    +
    + + + + + + + + + + + + + + + + +
    void initCRegs (x86::Emitter * a,
    int rowRegs,
    int colRegs )
    +
    + +

    Generate instructions for initializing the C registers to 0.

    +

    Generate instructions for initializing the C registers to 0 in 32-bit Accumulation kernel.

    + +
    +
    + +

    ◆ is_autovec_disabled()

    + +
    +
    + + + + + + + +
    bool is_autovec_disabled ()
    +
    +

    Choosing which kernel (autovec/asmjit/ref) to use for nbit-CPU-TBE Available kernels:

      +
    • ref: non-optimized, reference implementation that focuses on correctness, not performance
    • +
    • asmjit: hand-optimized kernel by having asmjit emit SIMD instructions during runtime. Only supports x86_64 CPUs with AVX2/AVX512 instruction sets
    • +
    • autovec: the kernel is written in regular C++ code, but in a way that makes it easier for compilers to generate vectorized SIMD instructions from it. Supports both x86_64 and aarch64 CPUs. Currently only available on Linux. How to set environment variables:
    • +
    • No environment variables: on x86_64 we will default to asmjit kernel, and on aarch64 and linux we will default to autovec. On non-linux aarch64 we will fall back to ref.
    • +
    • Set FBGEMM_NO_AUTOVEC: on aarch64 linux we will use ref. On other platforms this will have no effect.
    • +
    • Set FBGEMM_NO_ASMJIT: on x86_64 we will use ref. On other platforms this will have no effect.
    • +
    • Set FBGEMM_NO_ASMJIT AND FBGEMM_FORCE_AUTOVEC: on x86_64 we will use autovec if these two variables are set at the same time. No effect on other platforms.
    • +
    • FBGEMM_FORCE_AUTOVEC will override FBGEMM_NO_AUTOVEC if they are set at the same time.
    • +
    • These variables are considered set as long as they exist regardless of content. That means assigning values like "1", "true", "y", "0", "false" or "no" has the same effect. The easiest way of setting a variable is to prepend <VARIABLE>=1 before the benchmarking command.
    • +
    + +
    +
    + +

    ◆ PackA()

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void PackA (int nrow,
    int ncol,
    const float * from,
    int ldim,
    float * to )
    +
    + +

    Todo: make it fast with AVX2 transpose.

    +

    class that performs packing of matrix in row-major or col-major format into internal packed blocked-row major format

    + +
    +
    + +

    ◆ printMatrix()

    + +
    +
    +
    +template<typename T >
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void printMatrix (matrix_op_t op,
    const T * inp,
    size_t R,
    size_t C,
    size_t ld,
    std::string name )
    +
    + +

    Debugging helper.

    +

    Print the matrix.

    +
    Parameters
    + + + + + + +
    opTranspose type of the matrix.
    RThe height of the matrix.
    CThe width of the matrix.
    ldThe leading dimension of the matrix.
    nameThe prefix string before printing the matrix.
    +
    +
    + +
    +
    + +

    ◆ Quantize()

    + +
    +
    +
    +template<typename T , bool LEGACY = true>
    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    T Quantize (float src,
    std::int32_t zero_point,
    float scale,
    int result_precision,
    bool result_is_signed = std::is_signed<T>::value )
    +
    +

    Quantize src using zero_point and scale, clamp to the specified precision, and convert it to type T

    + +
    +
    + +

    ◆ requantize_u8acc32_ref() [1/2]

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void requantize_u8acc32_ref (int M,
    int N,
    int ld,
    const std::int32_t * inp,
    std::uint8_t * out,
    const float * C_multiplier,
    std::int32_t C_zero_point,
    std::int32_t A_zero_point,
    const std::int32_t * B_zero_point,
    const std::int32_t * row_offsets,
    const std::int32_t * col_offsets,
    const std::int32_t * bias,
    int ncols_per_quant_group,
    bool fuse_relu = false )
    +
    + +

    Reference implementation of requantization step. float multiplier.

    +
    Parameters
    + + + +
    biascan be nullptr
    ncols_per_quant_groupthe number of columns that share the same quantization parameter. ncols_per_quant_group == N : per-tensor quantization ncols_per_quant_group == N / groups : per-group quantization ncols_per_quant_group == 1 : per-channel quantization
    +
    +
    + +
    +
    + +

    ◆ requantize_u8acc32_ref() [2/2]

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void requantize_u8acc32_ref (int M,
    int N,
    int ld,
    const std::int32_t * inp,
    std::uint8_t * out,
    std::int32_t C_multiplier,
    std::int32_t C_right_shift,
    std::int32_t C_zero_point,
    std::int32_t A_zero_point,
    std::int32_t B_zero_point,
    const std::int32_t * row_offsets,
    const std::int32_t * col_offsets,
    const std::int32_t * bias,
    bool fuse_relu = false )
    +
    + +

    Reference implementation of requantization step. int32 multiplier.

    +
    Parameters
    + + +
    biascan be nullptr
    +
    +
    + +
    +
    + +

    ◆ rowOffsetBufferSizeGConv()

    + +
    +
    +
    +template<int SPATIAL_DIM = 2>
    + + + + + + + +
    int rowOffsetBufferSizeGConv (const conv_param_t< SPATIAL_DIM > & conv_param)
    +
    +
    Returns
    Size of row offset buffer in number of elements needed for fbgemmGroupwiseConv
    + +
    +
    + +

    ◆ rowwise_sparse_adagrad_ref()

    + +
    +
    +
    +template<typename IndexType >
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    int rowwise_sparse_adagrad_ref (int num_rows,
    int block_size,
    std::uint64_t param_size,
    float * w,
    const float * g,
    float * h,
    const IndexType * indices,
    float epsilon,
    float lr,
    float weight_decay = 0.f,
    const double * counter = nullptr,
    const int64_t counter_halflife = 0 )
    +
    +
    Parameters
    + + + + + + + + + + +
    num_rowsnumber of rows to read
    block_sizenumber of parameters per row
    param_sizetotal number of parameters
    winput parameters
    ginput gradients
    hinput momentum
    indicesindices of each row
    counterused for weight_decay adjusted for frequency. nullptr when frequency adjustment is not used. Ignored when weight_decay == 0
    counter_halflifeweight_decay is adjusted only after this number of iterations
    +
    +
    + +
    +
    + +

    ◆ sparse_adagrad_ref()

    + +
    +
    +
    +template<typename IndexType >
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    int sparse_adagrad_ref (int num_rows,
    int block_size,
    std::uint64_t param_size,
    float * w,
    const float * g,
    float * h,
    const IndexType * indices,
    float epsilon,
    float lr,
    float weight_decay = 0.f,
    const double * counter = nullptr,
    const int64_t counter_halflife = 0 )
    +
    +
    Parameters
    + + + + + + + + + + +
    num_rowsnumber of rows to read
    block_sizenumber of parameters per row
    param_sizetotal number of parameters
    winput parameters
    ginput gradients
    hinput momentum
    indicesindices of each row
    counterused for weight_decay adjusted for frequency. nullptr when frequency adjustment is not used. Ignored when weight_decay == 0
    counter_halflifeweight_decay is adjusted only after this number of iterations
    +
    +
    + +
    +
    + +

    ◆ SparseDenseMM()

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void SparseDenseMM (int M,
    int N,
    const int * row_ptr,
    const int * col_idx,
    const float * values,
    const float * B,
    int ldb,
    float * C,
    int ldc,
    bool accum = false )
    +
    +
    Parameters
    + + +
    accumControls accumulation. 1 means we're accumulating to the C Matrix.
    +
    +
    +

    Note on matrix order and layout: Unlike other fbgemm functions that follow PyTorch convention where A matrix is activation (so in uint8_t for quantized FC/Conv or fp32) and B matrix is weight (so in int8_t for quantized FC/Conv or fp32), here A is weight matrix. This is because we mostly target sparsity in weights and for row-major layout it's more efficient to have A as a sparse matrix: for each non-zero of A at ith row and kth column, we can access kth row of B, whose elements are contiguous in memory. If B matrix was sparse, for each non-zero of B at kth row and jth column, we would've needed to access kth column of A, whose elements are not contiguous in memory with C/C++'s row-major layout. Alternatively, we can call this function as if we're computing C^T = B^T * A^T while maintaining PyTorch's convention that the lefthand side matrix B is activation. If B matrix is in column-major layout, we don't need to do an extra transposition. The C matrix will be output in column-major layout, so if we have a back-to-back Sparse-Dense matrix-matrix multiplications, B matrices of subsequent matrices will be already in column-major layout. Refer to SparseDenseMMFP32Benchmark.cc for an example.

    + +
    +
    + +

    ◆ spmdm_ref()

    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void spmdm_ref (int M,
    const std::uint8_t * A,
    int lda,
    CompressedSparseColumn & B,
    bool accumulation,
    std::int32_t * C,
    int ldc,
    int groups = 1 )
    +
    + +

    Reference implementation of SPMDM (sparse matrix times dense matrix).

    +
    Parameters
    + + +
    groupswhen > 1, for gth group, we multiply A[:,g*(A.ncols/groups):(g+1)*(A.ncols/groups)] sub-matrix with B[:,g*(B.ncols/groups):(g+1)*(B.ncols/groups)] sub-matrix .
    +
    +
    + +
    +
    + +

    ◆ transpose_ref()

    + +
    +
    +
    +template<typename T >
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void transpose_ref (int64_t M,
    int64_t N,
    const T * src,
    int64_t ld_src,
    T * dst,
    int64_t ld_dst )
    +
    + +

    Reference implementation of matrix transposition: B = A^T.

    +
    Parameters
    + + + + + + + +
    MThe height of the matrix.
    NThe width of the matrix.
    srcThe memory buffer of the source matrix A.
    ld_srcThe leading dimension of the source matrix A.
    dstThe memory buffer of the destination matrix B.
    ld_dstThe leading dimension of the destination matrix B.
    +
    +
    + +
    +
    + +

    ◆ transpose_simd()

    + +
    +
    +
    +template<typename T >
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    void transpose_simd (int64_t M,
    int64_t N,
    const T * src,
    int64_t ld_src,
    T * dst,
    int64_t ld_dst )
    +
    + +

    Transpose a matrix.

    +
    Parameters
    + + + +
    Mthe number of rows of input matrix
    Nthe number of columns of input matrix
    +
    +
    + +
    +
    +
    + + + + diff --git a/namespacemembers.html b/namespacemembers.html new file mode 100644 index 000000000..baa0f4c8a --- /dev/null +++ b/namespacemembers.html @@ -0,0 +1,244 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
    +
    + + + + + + +
    +
    fbgemm_gpu +
    +
    +
    + + + + + + + +
    + +
    +
    + + +
    +
    +
    +
    +
    +
    Loading...
    +
    Searching...
    +
    No Matches
    +
    +
    +
    +
    + +
    +
    Here is a list of all documented namespace members with links to the namespaces they belong to:
    + +

    - b -

    + + +

    - c -

    + + +

    - d -

    + + +

    - e -

    + + +

    - f -

    + + +

    - g -

    + + +

    - i -

    + + +

    - m -

    + + +

    - o -

    + + +

    - p -

    + + +

    - q -

    + + +

    - r -

    + + +

    - s -

    + + +

    - t -

    + + +

    - x -

    +
    + + + + diff --git a/namespacemembers_enum.html b/namespacemembers_enum.html new file mode 100644 index 000000000..062e44351 --- /dev/null +++ b/namespacemembers_enum.html @@ -0,0 +1,86 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
    +
    + + + + + + +
    +
    fbgemm_gpu +
    +
    +
    + + + + + + + +
    + +
    +
    + + +
    +
    +
    +
    +
    +
    Loading...
    +
    Searching...
    +
    No Matches
    +
    +
    +
    +
    + +
    +
    Here is a list of all documented namespace enums with links to the namespaces they belong to:
    +
    + + + + diff --git a/namespacemembers_func.html b/namespacemembers_func.html new file mode 100644 index 000000000..0bd201156 --- /dev/null +++ b/namespacemembers_func.html @@ -0,0 +1,236 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
    +
    + + + + + + +
    +
    fbgemm_gpu +
    +
    +
    + + + + + + + +
    + +
    +
    + + +
    +
    +
    +
    +
    +
    Loading...
    +
    Searching...
    +
    No Matches
    +
    +
    +
    +
    + +
    +
    Here is a list of all documented namespace functions with links to the namespaces they belong to:
    + +

    - b -

    + + +

    - c -

    + + +

    - d -

    + + +

    - e -

    + + +

    - f -

    + + +

    - g -

    + + +

    - i -

    + + +

    - m -

    + + +

    - p -

    + + +

    - q -

    + + +

    - r -

    + + +

    - s -

    + + +

    - t -

    + + +

    - x -

    +
    + + + + diff --git a/objects.inv b/objects.inv index ee023fc7a..567cb56ec 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/py-modindex.html b/py-modindex.html index b29ef9189..8d1005ba0 100644 --- a/py-modindex.html +++ b/py-modindex.html @@ -299,6 +299,7 @@
  • Combine Input Operators
  • Layout Transformation Operators
  • Embedding Operators
  • +
  • Experimental Operators
  • FBGEMM_GPU Python API

    FBGEMM_GPU Python API

    diff --git a/structfbgemm_1_1_b_c_s_r_matrix.html b/structfbgemm_1_1_b_c_s_r_matrix.html index b4dc019f8..b2796577b 100644 --- a/structfbgemm_1_1_b_c_s_r_matrix.html +++ b/structfbgemm_1_1_b_c_s_r_matrix.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1_blocking_factors-members.html b/structfbgemm_1_1_blocking_factors-members.html index 454ab0188..bdddba653 100644 --- a/structfbgemm_1_1_blocking_factors-members.html +++ b/structfbgemm_1_1_blocking_factors-members.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1_blocking_factors.html b/structfbgemm_1_1_blocking_factors.html index 684ea9ff9..707cee4e5 100644 --- a/structfbgemm_1_1_blocking_factors.html +++ b/structfbgemm_1_1_blocking_factors.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1_packing_traits.html b/structfbgemm_1_1_packing_traits.html index dd10abb86..c59cfa8a8 100644 --- a/structfbgemm_1_1_packing_traits.html +++ b/structfbgemm_1_1_packing_traits.html @@ -71,7 +71,7 @@
    @@ -79,7 +79,7 @@

    Detailed Description

    -
    template<typename T, typename accT, inst_set_t instSet, typename int8Type = void>
    +
    template<typename T, typename accT, inst_set_t instSet, typename int8Type = void>
    struct fbgemm::PackingTraits< T, accT, instSet, int8Type >
    Template Parameters
    diff --git a/structfbgemm_1_1_requantization_params-members.html b/structfbgemm_1_1_requantization_params-members.html index b1df7f70c..094edeca0 100644 --- a/structfbgemm_1_1_requantization_params-members.html +++ b/structfbgemm_1_1_requantization_params-members.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1_requantization_params.html b/structfbgemm_1_1_requantization_params.html index df49c8090..2a84c2693 100644 --- a/structfbgemm_1_1_requantization_params.html +++ b/structfbgemm_1_1_requantization_params.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1_tensor_quantization_params-members.html b/structfbgemm_1_1_tensor_quantization_params-members.html index b291fa77c..9c1661b20 100644 --- a/structfbgemm_1_1_tensor_quantization_params-members.html +++ b/structfbgemm_1_1_tensor_quantization_params-members.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1_tensor_quantization_params.html b/structfbgemm_1_1_tensor_quantization_params.html index 5d442e263..b8fffd460 100644 --- a/structfbgemm_1_1_tensor_quantization_params.html +++ b/structfbgemm_1_1_tensor_quantization_params.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1block__type__t-members.html b/structfbgemm_1_1block__type__t-members.html index 642731189..ef3abb0c8 100644 --- a/structfbgemm_1_1block__type__t-members.html +++ b/structfbgemm_1_1block__type__t-members.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1block__type__t.html b/structfbgemm_1_1block__type__t.html index 0d349b6cd..165103a53 100644 --- a/structfbgemm_1_1block__type__t.html +++ b/structfbgemm_1_1block__type__t.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1conv__param__t-members.html b/structfbgemm_1_1conv__param__t-members.html index 16462957b..4f6922709 100644 --- a/structfbgemm_1_1conv__param__t-members.html +++ b/structfbgemm_1_1conv__param__t-members.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1conv__param__t.html b/structfbgemm_1_1conv__param__t.html index 5de7862e7..4eb99ed8f 100644 --- a/structfbgemm_1_1conv__param__t.html +++ b/structfbgemm_1_1conv__param__t.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1is__8bit-members.html b/structfbgemm_1_1is__8bit-members.html index 3c554ab49..492824906 100644 --- a/structfbgemm_1_1is__8bit-members.html +++ b/structfbgemm_1_1is__8bit-members.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1is__8bit.html b/structfbgemm_1_1is__8bit.html index 1528420cf..cc9ed8044 100644 --- a/structfbgemm_1_1is__8bit.html +++ b/structfbgemm_1_1is__8bit.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1requantization_for_float_params__t-members.html b/structfbgemm_1_1requantization_for_float_params__t-members.html index ffbcc9887..1a943ceb4 100644 --- a/structfbgemm_1_1requantization_for_float_params__t-members.html +++ b/structfbgemm_1_1requantization_for_float_params__t-members.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1requantization_for_float_params__t.html b/structfbgemm_1_1requantization_for_float_params__t.html index 0f925c5e2..8f4c2f6db 100644 --- a/structfbgemm_1_1requantization_for_float_params__t.html +++ b/structfbgemm_1_1requantization_for_float_params__t.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1requantization_params__t-members.html b/structfbgemm_1_1requantization_params__t-members.html index 1f1b28dd1..3f81214da 100644 --- a/structfbgemm_1_1requantization_params__t-members.html +++ b/structfbgemm_1_1requantization_params__t-members.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1requantization_params__t.html b/structfbgemm_1_1requantization_params__t.html index 2ce92ddf3..d9f33dc58 100644 --- a/structfbgemm_1_1requantization_params__t.html +++ b/structfbgemm_1_1requantization_params__t.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1simd__info.html b/structfbgemm_1_1simd__info.html index 0d3c8ab17..dfc3d9086 100644 --- a/structfbgemm_1_1simd__info.html +++ b/structfbgemm_1_1simd__info.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1thread__type__t-members.html b/structfbgemm_1_1thread__type__t-members.html index 89f376656..3286142fc 100644 --- a/structfbgemm_1_1thread__type__t-members.html +++ b/structfbgemm_1_1thread__type__t-members.html @@ -71,7 +71,7 @@
    diff --git a/structfbgemm_1_1thread__type__t.html b/structfbgemm_1_1thread__type__t.html index acdef2d44..73c3d3d4e 100644 --- a/structfbgemm_1_1thread__type__t.html +++ b/structfbgemm_1_1thread__type__t.html @@ -71,7 +71,7 @@
    diff --git a/topics.html b/topics.html index 06a4f1885..4d0467513 100644 --- a/topics.html +++ b/topics.html @@ -86,16 +86,17 @@
    - - - - - - - - - - + + + + + + + + + + +
    Tinput type
     CUDA Operators
     Embedding CUDA Operators
     Embedding CPU Operators
     Example Method Group
     Jagged Tensor CUDA Operators
     Jagged Tensor Operators
     Layout Transformation CUDA Operators
     Layout Transformation CPU Operators
     Quantization Operators (CUDA)
     Quantize Data CPU Operators
     Quantization Utilities (Generic)
     Quantization Utilities (AVX2)
     Quantization Utilities (AVX512)
     Experimental-gen-ai-attention
     Example Method Group
     Jagged Tensor CUDA Operators
     Jagged Tensor Operators
     Layout Transformation CUDA Operators
     Layout Transformation CPU Operators
     Quantization Operators (CUDA)
     Quantize Data CPU Operators
     Quantization Utilities (Generic)
     Quantization Utilities (AVX2)
     Quantization Utilities (AVX512)