From 99c007249d27e023ea2ff3f5f95a04f56f6d33c8 Mon Sep 17 00:00:00 2001 From: Florian Treml Date: Fri, 21 Jan 2022 17:43:28 +0100 Subject: [PATCH] Added support for AWS Transcribe Streaming --- frontend/package.json | 1 + frontend/src/routes.js | 9 +- frontend/src/stt/awstranscribe.js | 133 ++++++++++++++++++++++++++++++ frontend/src/swagger.json | 9 +- frontend/src/utils.js | 18 ++++ 5 files changed, 163 insertions(+), 7 deletions(-) create mode 100644 frontend/src/stt/awstranscribe.js diff --git a/frontend/package.json b/frontend/package.json index 8b455f7..791fbe4 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -14,6 +14,7 @@ "license": "MIT", "dependencies": { "@aws-sdk/client-polly": "^3.47.1", + "@aws-sdk/client-transcribe-streaming": "^3.47.1", "@google-cloud/speech": "^4.10.0", "@google-cloud/storage": "^5.18.0", "@google-cloud/text-to-speech": "^3.4.0", diff --git a/frontend/src/routes.js b/frontend/src/routes.js index 98bd7ec..a2c7bda 100644 --- a/frontend/src/routes.js +++ b/frontend/src/routes.js @@ -46,7 +46,8 @@ const sttEngines = { google: new (require('./stt/google'))(), kaldi: new (require('./stt/kaldi'))(), ibm: new (require('./stt/ibm'))(), - azure: new (require('./stt/azure'))() + azure: new (require('./stt/azure'))(), + awstranscribe: new (require('./stt/awstranscribe'))() } const multerMemoryStorage = multer.memoryStorage() @@ -124,7 +125,7 @@ const router = express.Router() * required: false * schema: * type: string - * enum: [kaldi, google, ibm, azure] + * enum: [kaldi, google, ibm, azure, awstranscribe] * responses: * 200: * description: List of supported STT languages @@ -176,7 +177,7 @@ const router = express.Router() * required: false * schema: * type: string - * enum: [kaldi, google, ibm, azure] + * enum: [kaldi, google, ibm, azure, awstranscribe] * - name: cache * description: Use result cache (default Y) * in: query @@ -705,7 +706,7 @@ const wssStreams = {} * required: false * schema: * type: string - * enum: [kaldi, 
// ---------------------------------------------------------------------------
// frontend/src/utils.js  (uses `_` = lodash, required at the top of that file)
// ---------------------------------------------------------------------------

/**
 * Builds the constructor options for the AWS Transcribe Streaming client.
 *
 * Each value is taken from `req.body.awstranscribe.credentials` first and
 * falls back to the matching `BOTIUM_SPEECH_AWS_*` environment variable.
 *
 * @param {object} req - request object; only `req.body.awstranscribe.credentials` is read
 * @returns {{region: string, credentials: {accessKeyId: string, secretAccessKey: string}}}
 * @throws {Error} when region, access key id or secret access key cannot be resolved
 */
const awstranscribeOptions = (req) => {
  const region = _.get(req, 'body.awstranscribe.credentials.region') || process.env.BOTIUM_SPEECH_AWS_REGION
  const accessKeyId = _.get(req, 'body.awstranscribe.credentials.accessKeyId') || process.env.BOTIUM_SPEECH_AWS_ACCESS_KEY_ID
  const secretAccessKey = _.get(req, 'body.awstranscribe.credentials.secretAccessKey') || process.env.BOTIUM_SPEECH_AWS_SECRET_ACCESS_KEY

  if (region && accessKeyId && secretAccessKey) {
    return {
      region,
      credentials: {
        accessKeyId,
        secretAccessKey
      }
    }
  }
  throw new Error('AWS Transcribe credentials not found')
}

// ---------------------------------------------------------------------------
// frontend/src/stt/awstranscribe.js
// (uses `_`, TranscribeStreamingClient, StartStreamTranscriptionCommand,
//  PassThrough, EventEmitter, debug and awstranscribeOptions, all required at
//  the top of that file)
// ---------------------------------------------------------------------------

// Language codes reported by languages(), sorted alphabetically.
const languageCodes = [
  'af-ZA',
  'ar-AE',
  'ar-SA',
  'zh-CN',
  'zh-TW',
  'da-DK',
  'nl-NL',
  'en-AU',
  'en-GB',
  'en-IN',
  'en-IE',
  'en-NZ',
  'en-AB',
  'en-ZA',
  'en-US',
  'en-WL',
  'fr-FR',
  'fr-CA',
  'fa-IR',
  'de-DE',
  'de-CH',
  'he-IL',
  'hi-IN',
  'id-ID',
  'it-IT',
  'ja-JP',
  'ko-KR',
  'ms-MY',
  'pt-PT',
  'pt-BR',
  'ru-RU',
  'es-ES',
  'es-US',
  'ta-IN',
  'te-IN',
  'th-TH',
  'tr-TR'
].sort()

// Upper bound for the size of a single AudioEvent payload, in bytes.
// NOTE(review): presumably chosen to stay below a service-side limit on the
// event size - confirm against the AWS Transcribe streaming documentation.
const MAX_AUDIO_CHUNK_BYTES = 25000

/**
 * Speech-to-text engine backed by AWS Transcribe Streaming.
 */
class AwsTranscribeSTT {
  /**
   * @param {object} req - unused
   * @returns {Promise<string[]>} the language codes listed in `languageCodes`
   */
  async languages (req) {
    return languageCodes
  }

  /**
   * Starts a streaming transcription and returns a handle to feed audio into
   * it and to receive the transcripts.
   *
   * The request defaults to 16 kHz PCM; every field except `AudioStream` can
   * be overridden through `req.body.awstranscribe.config`.
   *
   * Events emitted on the returned `events` emitter:
   *  - `data` with `{ text, final, start, end, debug }` for each result that
   *    carries at least one alternative,
   *  - `data` with `{ err }` when reading the result stream fails,
   *  - `close` once the result stream is finished (successfully or not).
   *
   * @param {object} req - request object (credentials and optional config are read from `req.body.awstranscribe`)
   * @param {{language: string}} options - `language` is sent as `LanguageCode`
   * @returns {Promise<{events: EventEmitter, write: function(Buffer): void, end: function(): void, close: function(): void}>}
   * @throws {Error} when credentials are missing or the transcription request cannot be started
   */
  async stt_OpenStream (req, { language }) {
    const transcribeClient = new TranscribeStreamingClient(awstranscribeOptions(req))

    const audioInputStream = new PassThrough()
    let inputEnded = false
    let closed = false

    // Re-slices whatever the caller wrote into events of at most
    // MAX_AUDIO_CHUNK_BYTES bytes. Buffer.from() copies each slice so an event
    // never aliases memory owned by the caller.
    const audioStream = async function * () {
      for await (const payloadChunk of audioInputStream) {
        for (let offset = 0; offset < payloadChunk.length; offset += MAX_AUDIO_CHUNK_BYTES) {
          const slice = payloadChunk.subarray(offset, offset + MAX_AUDIO_CHUNK_BYTES)
          yield { AudioEvent: { AudioChunk: Buffer.from(slice) } }
        }
      }
    }

    // AudioStream is assigned last on purpose: a value coming from the request
    // body must not disconnect write()/end() from the transcription request.
    const request = Object.assign(
      {
        LanguageCode: language,
        MediaEncoding: 'pcm',
        MediaSampleRateHertz: 16000
      },
      _.get(req, 'body.awstranscribe.config'),
      { AudioStream: audioStream() }
    )

    let cmdResponse
    try {
      cmdResponse = await transcribeClient.send(new StartStreamTranscriptionCommand(request))
    } catch (err) {
      debug(err)
      // Nobody will ever read from the input stream, release it.
      audioInputStream.destroy()
      throw new Error(`AWS Transcribe STT streaming failed: ${err.message}`)
    }

    const events = new EventEmitter()
    const forwardTranscripts = async () => {
      try {
        for await (const streamEvent of cmdResponse.TranscriptResultStream) {
          const results = _.get(streamEvent, 'TranscriptEvent.Transcript.Results') || []
          for (const result of results) {
            const alternatives = result.Alternatives
            // A result without alternatives carries no text; skip it instead
            // of failing the whole stream on `Alternatives[0]`.
            if (!alternatives || alternatives.length === 0) continue
            events.emit('data', {
              text: alternatives[0].Transcript,
              final: !result.IsPartial,
              start: result.StartTime,
              end: result.EndTime,
              debug: result
            })
          }
        }
      } catch (err) {
        debug(err)
        events.emit('data', {
          err: `${err.message}`
        })
      }
      events.emit('close')
    }
    // Deferred with a timer so that this method returns `events` before the
    // first event is emitted.
    setTimeout(forwardTranscripts, 0)

    return {
      events,
      write: (buffer) => {
        // Audio arriving after end()/close() is dropped; writing to an ended
        // or destroyed stream would raise an unhandled stream error.
        if (closed || inputEnded) {
          debug('ignoring audio written after the stream was ended or closed')
          return
        }
        audioInputStream.write(buffer)
      },
      end: () => {
        if (closed || inputEnded) return
        inputEnded = true
        audioInputStream.end()
      },
      close: () => {
        if (closed) return
        closed = true
        audioInputStream.destroy()
      }
    }
  }

  /**
   * Non-streaming transcription.
   *
   * NOTE(review): not implemented - the method resolves to `undefined` without
   * contacting AWS. Callers presumably expect a transcription result; confirm
   * how the routes handle this engine before exposing it for non-streaming use.
   *
   * @param {object} req - unused
   * @param {{language: string, buffer: Buffer, hint: *}} options - unused
   * @returns {Promise<undefined>}
   */
  async stt (req, { language, buffer, hint }) {

  }
}