diff --git a/components/nn/include/maix_nn_speech.hpp b/components/nn/include/maix_nn_speech.hpp
index 3da7ae7..bd6663b 100644
--- a/components/nn/include/maix_nn_speech.hpp
+++ b/components/nn/include/maix_nn_speech.hpp
@@ -19,6 +19,7 @@ static std::function<void(char*, int)> _digit_callback;
 static std::function<void(std::vector<float>, int)> _kws_callback;
 static std::function<void(std::pair<char*, char*>, int)> _lvcsr_callback;
+#ifdef PLATFORM_MAIXCAM
 
 namespace maix::nn
 {
 
@@ -623,3 +624,341 @@ enum SpeechDecoder {
 };
 
 } // namespace maix::nn
+#endif
+
+#ifdef PLATFORM_LINUX
+namespace maix::nn
+{
+
+/**
+ * @brief speech device
+ * @maixpy maix.nn.SpeechDevice
+ */
+enum SpeechDevice {
+    DEVICE_NONE = -1,
+    DEVICE_PCM,
+    DEVICE_MIC,
+    DEVICE_WAV,
+};
+
+/**
+ * @brief speech decoder type
+ * @maixpy maix.nn.SpeechDecoder
+ */
+enum SpeechDecoder {
+    DECODER_RAW = 1,
+    DECODER_DIG = 2,
+    DECODER_LVCSR = 4,
+    DECODER_KWS = 8,
+    DECODER_ALL = 65535,
+};
+
+    /**
+     * Speech
+     * @maixpy maix.nn.Speech
+     */
+    class Speech
+    {
+    public:
+        /**
+         * Construct a new Speech object
+         * @param model model path, default empty; you can load a model later with the load function.
+         * @throw If the model arg is not empty and loading fails, err::Exception will be thrown.
+         * @maixpy maix.nn.Speech.__init__
+         * @maixcdk maix.nn.Speech.Speech
+         */
+        Speech(const string &model = "")
+        {
+
+        }
+
+        ~Speech()
+        {
+
+        }
+
+        /**
+         * Load model from file
+         * @param model Model path to load
+         * @return err::Err
+         * @maixpy maix.nn.Speech.load
+         */
+        err::Err load(const string &model)
+        {
+            return err::ERR_NONE;
+        }
+
+        /**
+         * Init the ASR library and select the type and name of the audio device.
+         * @param dev_type device type to use, choose between WAV, PCM, or MIC.
+         * @param device_name device name to use: a WAV file path, a PCM file path, or a MIC device name.
+         * @throw If the acoustic model is not loaded, err::ERR_NOT_IMPL will be thrown.
+         * @throw If the device is not supported, err::ERR_NOT_IMPL will be thrown.
+         * @return err::Err type, if init success, return err::ERR_NONE
+         * @maixpy maix.nn.Speech.init
+         */
+        err::Err init(nn::SpeechDevice dev_type, const string &device_name)
+        {
+            return err::ERR_NONE;
+        }
+
+        /**
+         * Reset the device, usually used for PCM/WAV recognition,
+         * such as identifying the next WAV file.
+         * @param dev_type device type to use, choose between WAV, PCM, or MIC.
+         * @param device_name device name to use: a WAV file path, a PCM file path, or a MIC device name.
+         * @throw If the device is not supported, err::ERR_NOT_IMPL will be thrown.
+         * @return err::Err type, if init success, return err::ERR_NONE
+         * @maixpy maix.nn.Speech.devive
+         */
+        err::Err devive(nn::SpeechDevice dev_type, const string &device_name)
+        {
+            return err::ERR_NONE;
+        }
+
+        /**
+         * Deinit the ASR library.
+         * @maixpy maix.nn.Speech.deinit
+         */
+        void deinit()
+        {
+            _dev_type = DEVICE_NONE;
+            _decoder_raw = false;
+            _decoder_dig = false;
+            _decoder_lvcsr = false;
+            _decoder_kws = false;
+        }
+
+        /**
+         * Deinit the decoder.
+         * @param decoder decoder type to deinit,
+         * choose between DECODER_RAW, DECODER_DIG, DECODER_LVCSR, DECODER_KWS or DECODER_ALL.
+         * @throw If the decoder is not supported, err::ERR_NOT_IMPL will be thrown.
+         * @maixpy maix.nn.Speech.deinit
+         */
+        void deinit(nn::SpeechDecoder decoder)
+        {
+
+        }
+
+        /**
+         * Init raw decoder, it will output the prediction results of the original acoustic model (AM).
+         * @param callback raw decoder user callback.
+         * @return err::Err type, if init success, return err::ERR_NONE
+         * @maixpy maix.nn.Speech.raw
+         */
+        err::Err raw(std::function<void(std::vector<std::pair<int, float>>, int)> callback)
+        {
+            return err::ERR_NONE;
+        }
+
+        /**
+         * Get raw decoder status
+         * @return bool, raw decoder status
+         * @maixpy maix.nn.Speech.raw
+         */
+        bool raw() { return _decoder_raw; }
+
+        /**
+         * Init digit decoder, it will output the Chinese digit recognition results within the last 4 seconds.
+         * @param blank If the idle (silent) period exceeds this value, a '_' is inserted into the output result to mark the silence.
+         * @param callback digit decoder user callback.
+         * @return err::Err type, if init success, return err::ERR_NONE
+         * @maixpy maix.nn.Speech.digit
+         */
+        err::Err digit(int blank, std::function<void(char*, int)> callback)
+        {
+            return err::ERR_NONE;
+        }
+
+        /**
+         * Get digit decoder status
+         * @return bool, digit decoder status
+         * @maixpy maix.nn.Speech.digit
+         */
+        bool digit() { return _decoder_dig; }
+
+        /**
+         * Init kws decoder, it will output a probability list of all registered keywords in the latest frame,
+         * users can set their own thresholds for wake-up.
+         * @param kw_tbl Keyword list; each keyword is written as space-separated pinyin, for example: xiao3 ai4 tong2 xue2
+         * @param kw_gate Keyword probability gate table; its length should be the same as kw_tbl.
+         * @param auto_similar Whether to perform automatic homophone processing;
+         * setting it to true will automatically calculate probabilities by treating pinyin with different tones as homophones.
+         * @param callback kws decoder user callback.
+         * @return err::Err type, if init success, return err::ERR_NONE
+         * @maixpy maix.nn.Speech.kws
+         */
+        err::Err kws(std::vector<string> kw_tbl, std::vector<float> kw_gate, std::function<void(std::vector<float>, int)> callback, bool auto_similar = true)
+        {
+            return err::ERR_NONE;
+        }
+
+        /**
+         * Get kws decoder status
+         * @return bool, kws decoder status
+         * @maixpy maix.nn.Speech.kws
+         */
+        bool kws() { return _decoder_kws; }
+
+        /**
+         * Init lvcsr decoder, it will output continuous speech recognition results (less than 1024 Chinese characters).
+         * @param sfst_name Sfst file path.
+         * @param sym_name Sym file path (output symbol table).
+         * @param phones_txt Path to phones.bin (pinyin table).
+         * @param words_txt Path to words.bin (dictionary table).
+         * @param callback lvcsr decoder user callback.
+         * @param beam The beam size for WFST search, 8 by default; recommended to be between 3 and 9.
+         * The larger the size, the larger the search space, and the more accurate but slower the search.
+         * @param bg_prob The absolute value of the natural logarithm of the default probability for background pinyin
+         * outside of BEAM-CNT, 10 by default.
+         * @param scale acoustics_cost = log(pny_prob)*scale.
+         * @param mmap Use mmap to load the WFST decoding image;
+         * if set to true, the beam should be less than 5.
+         * @return err::Err type, if init success, return err::ERR_NONE
+         * @maixpy maix.nn.Speech.lvcsr
+         */
+        err::Err lvcsr(const string &sfst_name, const string &sym_name,
+                       const string &phones_txt, const string &words_txt,
+                       std::function<void(std::pair<char*, char*>, int)> callback,
+                       float beam = 8, float bg_prob = 10, float scale = 0.5, bool mmap = false)
+        {
+            return err::ERR_NONE;
+        }
+
+        /**
+         * Get lvcsr decoder status
+         * @return bool, lvcsr decoder status
+         * @maixpy maix.nn.Speech.lvcsr
+         */
+        bool lvcsr() { return _decoder_lvcsr; }
+
+        /**
+         * Run speech recognition. The user can run one frame at a time and do other processing after each run,
+         * or run continuously within a thread and stop it from an external thread.
+         * @param frame The number of frames per run.
+         * @return int type, return the actual number of frames processed in this run.
+         * @maixpy maix.nn.Speech.run
+         */
+        int run(int frame)
+        {
+            return 0;
+        }
+
+        /**
+         * Reset internal cache operation
+         * @maixpy maix.nn.Speech.clear
+         */
+        void clear()
+        {
+
+        }
+
+        /**
+         * Get the time of one frame.
+         * @return int type, return the time of one frame.
+         * @maixpy maix.nn.Speech.frame_time
+         */
+        int frame_time()
+        {
+            return 0;
+        }
+
+        /**
+         * Get the acoustic model dictionary.
+         * @return std::pair<std::string, int> type, return the dictionary and its length.
+         * @maixpy maix.nn.Speech.vocab
+         */
+        std::pair<std::string, int> vocab()
+        {
+            return {"", 0};
+        }
+
+        /**
+         * Manually register homophones; each pinyin can register up to 10 homophones.
+         * Please note that registering homophones with this interface will overwrite
+         * the homophone table automatically generated by the "automatic homophone processing" feature.
+         * @param pny The pinyin to register homophones for.
+         * @param similar_pnys List of similar-sounding pinyin for the registered pinyin.
+         * @return err::Err type, if init success, return err::ERR_NONE
+         * @maixpy maix.nn.Speech.similar
+         */
+        err::Err similar(const string &pny, std::vector<string> similar_pnys)
+        {
+            return err::ERR_NONE;
+        }
+
+    public:
+        /**
+         * Get mean value, list type
+         * @maixpy maix.nn.Speech.mean
+         */
+        std::vector<float> mean;
+
+        /**
+         * Get scale value, list type
+         * @maixpy maix.nn.Speech.scale
+         */
+        std::vector<float> scale;
+
+        /**
+         * Get device type
+         * @return nn::SpeechDevice type, see SpeechDevice of this module
+         * @maixpy maix.nn.Speech.dev_type
+         */
+        nn::SpeechDevice dev_type() { return _dev_type; }
+
+    private:
+        nn::NN *_model;
+        std::string _model_path = "";
+        std::map<std::string, std::string> _extra_info;
+        image::Size _input_size;
+        std::vector<nn::LayerInfo> _inputs;
+        nn::SpeechDevice _dev_type = DEVICE_NONE;
+        bool _decoder_raw = false;
+        bool _decoder_dig = false;
+        bool _decoder_kws = false;
+        bool _decoder_lvcsr = false;
+
+        static void digit_callback_wrapper(void* data, int cnt) {
+
+        }
+
+        static void kws_callback_wrapper(void* data, int cnt) {
+
+        }
+
+        static void raw_callback_wrapper(void* data, int cnt) {
+
+        }
+
+        static void lvcsr_callback_wrapper(void* data, int cnt) {
+
+        }
+
+        static void split0(std::vector<std::string> &items, const std::string &s, const std::string &delimiter)
+        {
+            items.clear();
+            size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+            std::string token;
+
+            while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos)
+            {
+                token = s.substr(pos_start, pos_end - pos_start);
+                pos_start = pos_end + delim_len;
+                items.push_back(token);
+            }
+
+            items.push_back(s.substr(pos_start));
+        }
+
+        static std::vector<std::string> split(const std::string &s, const std::string &delimiter)
+        {
+            std::vector<std::string> tokens;
+            split0(tokens, s, delimiter);
+            return tokens;
+        }
+    };
+
+} // namespace maix::nn
+#endif
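
Reviewer note: below is a minimal usage sketch of the API this PLATFORM_LINUX stub mirrors, kept outside the patch itself. It assumes the usual MaixCDK runtime helpers (maix_basic.hpp, err::check_raise, app::need_exit); the model path, WAV file name, keyword, and gate value are illustrative placeholders only, and on PLATFORM_LINUX these stubs currently return err::ERR_NONE and never invoke the callback, so the loop below only does real work on MaixCAM.

// Usage sketch (illustrative only): keyword spotting with the kws decoder.
#include "maix_basic.hpp"
#include "maix_nn_speech.hpp"
#include <cstdio>

using namespace maix;

int main()
{
    // Model path and audio source are placeholders, not files shipped with this PR.
    nn::Speech speech("am_model.mud");
    err::check_raise(speech.init(nn::SpeechDevice::DEVICE_WAV, "audio.wav"), "speech init failed");

    // Register one keyword ("xiao3 ai4 tong2 xue2") with an illustrative probability gate of 0.1.
    speech.kws({"xiao3 ai4 tong2 xue2"}, {0.1f},
               [](std::vector<float> prob, int frames) {
                   // prob holds one probability per registered keyword for the latest frame.
                   for (size_t i = 0; i < prob.size(); i++)
                       printf("kw[%zu] p=%.3f (frames=%d)\n", i, prob[i], frames);
               });

    // Feed audio one frame per iteration; run() reports how many frames were actually
    // consumed, so a short read is treated here as end of input.
    while (!app::need_exit())
    {
        if (speech.run(1) < 1)
            break;
    }
    speech.deinit();
    return 0;
}

Because the stubbed class keeps the same signatures as the MaixCAM implementation, code like this builds unchanged on both platforms, which appears to be the point of guarding the two copies with PLATFORM_MAIXCAM/PLATFORM_LINUX.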