Skip to content

Commit

Permalink
Added support for AWS Transcribe Streaming
Browse files Browse the repository at this point in the history
  • Loading branch information
Florian Treml committed Jan 21, 2022
1 parent 96f47b5 commit 99c0072
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 7 deletions.
1 change: 1 addition & 0 deletions frontend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"license": "MIT",
"dependencies": {
"@aws-sdk/client-polly": "^3.47.1",
"@aws-sdk/client-transcribe-streaming": "^3.47.1",
"@google-cloud/speech": "^4.10.0",
"@google-cloud/storage": "^5.18.0",
"@google-cloud/text-to-speech": "^3.4.0",
Expand Down
9 changes: 5 additions & 4 deletions frontend/src/routes.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ const sttEngines = {
google: new (require('./stt/google'))(),
kaldi: new (require('./stt/kaldi'))(),
ibm: new (require('./stt/ibm'))(),
azure: new (require('./stt/azure'))()
azure: new (require('./stt/azure'))(),
awstranscribe: new (require('./stt/awstranscribe'))()
}

const multerMemoryStorage = multer.memoryStorage()
Expand Down Expand Up @@ -124,7 +125,7 @@ const router = express.Router()
* required: false
* schema:
* type: string
* enum: [kaldi, google, ibm, azure]
* enum: [kaldi, google, ibm, azure, awstranscribe]
* responses:
* 200:
* description: List of supported STT languages
Expand Down Expand Up @@ -176,7 +177,7 @@ const router = express.Router()
* required: false
* schema:
* type: string
* enum: [kaldi, google, ibm, azure]
* enum: [kaldi, google, ibm, azure, awstranscribe]
* - name: cache
* description: Use result cache (default Y)
* in: query
Expand Down Expand Up @@ -705,7 +706,7 @@ const wssStreams = {}
* required: false
* schema:
* type: string
* enum: [kaldi, google, ibm, azure]
* enum: [kaldi, google, ibm, azure, awstranscribe]
* responses:
* 200:
* description: Websocket Url to stream the audio to, and the uri to check status and end the stream
Expand Down
133 changes: 133 additions & 0 deletions frontend/src/stt/awstranscribe.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
const _ = require('lodash')
const { TranscribeStreamingClient, StartStreamTranscriptionCommand } = require('@aws-sdk/client-transcribe-streaming')
const { PassThrough } = require('stream')
const EventEmitter = require('events')

const debug = require('debug')('botium-speech-processing-awstranscribe-stt')

const { awstranscribeOptions } = require('../utils')

// Language codes handed out by AwsTranscribeSTT#languages.
// Written one language per row for readability; the trailing sort() puts the
// literal into default string order, so the row order here does not matter.
const languageCodes = [
  'af-ZA',
  'ar-AE', 'ar-SA',
  'zh-CN', 'zh-TW',
  'da-DK',
  'nl-NL',
  'en-AU', 'en-GB', 'en-IN', 'en-IE', 'en-NZ', 'en-AB', 'en-ZA', 'en-US', 'en-WL',
  'fr-FR', 'fr-CA',
  'fa-IR',
  'de-DE', 'de-CH',
  'he-IL',
  'hi-IN',
  'id-ID',
  'it-IT',
  'ja-JP',
  'ko-KR',
  'ms-MY',
  'pt-PT', 'pt-BR',
  'ru-RU',
  'es-ES', 'es-US',
  'ta-IN',
  'te-IN',
  'th-TH',
  'tr-TR'
].sort()

/**
 * Speech-to-text engine that streams audio to AWS Transcribe Streaming.
 */
class AwsTranscribeSTT {
  /**
   * Lists the language codes offered by this engine.
   *
   * @param {object} req - incoming request (not used by this method)
   * @returns {Promise<string[]>} the module-level languageCodes list
   */
  async languages (req) {
    return languageCodes
  }

  /**
   * Opens a streaming transcription session.
   *
   * The request defaults to 16 kHz PCM; any properties found in
   * req.body.awstranscribe.config are merged over those defaults.
   *
   * @param {object} req - incoming request; passed to awstranscribeOptions to build the client options
   * @param {object} options
   * @param {string} options.language - value sent as LanguageCode
   * @returns {Promise<{events: EventEmitter, write: Function, end: Function, close: Function}>}
   *   events emits 'data' with { text, final, start, end, debug } for every result
   *   (or { err } when reading the result stream fails) and 'close' once the result
   *   stream is finished; write(buffer) queues audio, end() signals end of audio,
   *   close() discards the audio input.
   * @throws {Error} when the streaming command cannot be started
   */
  async stt_OpenStream (req, { language }) {
    // Buffers larger than this are split into several AudioEvents.
    const maxAudioChunkBytes = 25000

    const transcribeClient = new TranscribeStreamingClient(awstranscribeOptions(req))

    // Kept as a constant reference (never nulled) so the generator below can
    // still iterate it safely when close() is called before streaming starts.
    const audioInputStream = new PassThrough()
    const audioStream = async function * () {
      for await (const payloadChunk of audioInputStream) {
        // Slice the buffer directly instead of expanding it into arrays of numbers;
        // Buffer.from copies, so each emitted chunk owns its memory.
        for (let offset = 0; offset < payloadChunk.length; offset += maxAudioChunkBytes) {
          yield { AudioEvent: { AudioChunk: Buffer.from(payloadChunk.subarray(offset, offset + maxAudioChunkBytes)) } }
        }
      }
    }

    const request = {
      LanguageCode: language,
      MediaEncoding: 'pcm',
      MediaSampleRateHertz: 16000,
      AudioStream: audioStream()
    }
    if (req.body && req.body.awstranscribe && req.body.awstranscribe.config) {
      Object.assign(request, req.body.awstranscribe.config)
    }

    const events = new EventEmitter()
    try {
      const cmdResponse = await transcribeClient.send(new StartStreamTranscriptionCommand(request))
      // Deferred so the caller can attach its listeners to the returned emitter
      // before the first result is emitted.
      setTimeout(async () => {
        try {
          for await (const streamEvent of cmdResponse.TranscriptResultStream) {
            const results = _.get(streamEvent, 'TranscriptEvent.Transcript.Results')
            if (results && results.length > 0) {
              for (const result of results) {
                events.emit('data', {
                  text: result.Alternatives[0].Transcript,
                  final: !result.IsPartial,
                  start: result.StartTime,
                  end: result.EndTime,
                  debug: result
                })
              }
            }
          }
        } catch (err) {
          events.emit('data', {
            err: `${err.message}`
          })
        }
        events.emit('close')
      }, 0)
    } catch (err) {
      debug(err)
      // Nobody will consume the audio input any more - release it.
      audioInputStream.destroy()
      throw new Error(`AWS Transcribe STT streaming failed: ${err.message}`)
    }

    return {
      events,
      write: (buffer) => {
        // Audio arriving after end()/close() is dropped; pushing it would raise
        // a stream error (push after EOF) or hit a destroyed stream.
        if (audioInputStream.destroyed || audioInputStream.writableEnded) return
        audioInputStream.push(buffer)
      },
      end: () => {
        if (audioInputStream.destroyed || audioInputStream.writableEnded) return
        audioInputStream.end()
      },
      close: () => {
        if (!audioInputStream.destroyed) {
          audioInputStream.destroy()
        }
      }
    }
  }

  /**
   * Non-streaming transcription. Not implemented: the method has no body and
   * resolves to undefined.
   *
   * @param {object} req - incoming request
   * @param {object} options
   * @param {string} options.language
   * @param {Buffer} options.buffer
   * @param {string} [options.hint]
   * @returns {Promise<undefined>}
   */
  async stt (req, { language, buffer, hint }) {
    // TODO: implement non-streaming transcription
  }
}

module.exports = AwsTranscribeSTT
9 changes: 6 additions & 3 deletions frontend/src/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@
"kaldi",
"google",
"ibm",
"azure"
"azure",
"awstranscribe"
]
}
}
Expand Down Expand Up @@ -127,7 +128,8 @@
"kaldi",
"google",
"ibm",
"azure"
"azure",
"awstranscribe"
]
}
},
Expand Down Expand Up @@ -605,7 +607,8 @@
"kaldi",
"google",
"ibm",
"azure"
"azure",
"awstranscribe"
]
}
}
Expand Down
18 changes: 18 additions & 0 deletions frontend/src/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,23 @@ const pollyOptions = (req) => {
throw new Error('AWS Polly credentials not found')
}

/**
 * Resolves the region and credentials for the AWS Transcribe client.
 *
 * Each value is read from req.body.awstranscribe.credentials first and falls
 * back to the matching BOTIUM_SPEECH_AWS_* environment variable.
 *
 * @param {object} req - incoming request
 * @returns {{region: string, credentials: {accessKeyId: string, secretAccessKey: string}}}
 * @throws {Error} when region, access key id or secret access key is missing
 */
const awstranscribeOptions = (req) => {
  const env = process.env

  const region = _.get(req, 'body.awstranscribe.credentials.region') || env.BOTIUM_SPEECH_AWS_REGION
  const accessKeyId = _.get(req, 'body.awstranscribe.credentials.accessKeyId') || env.BOTIUM_SPEECH_AWS_ACCESS_KEY_ID
  const secretAccessKey = _.get(req, 'body.awstranscribe.credentials.secretAccessKey') || env.BOTIUM_SPEECH_AWS_SECRET_ACCESS_KEY

  // All three values are mandatory - bail out as soon as one is missing.
  if (!region || !accessKeyId || !secretAccessKey) {
    throw new Error('AWS Transcribe credentials not found')
  }

  const credentials = { accessKeyId, secretAccessKey }
  return { region, credentials }
}

const azureSpeechConfig = (req) => {
const subscriptionKey = _.get(req, 'body.azure.credentials.subscriptionKey') || process.env.BOTIUM_SPEECH_AZURE_SUBSCRIPTION_KEY
const region = _.get(req, 'body.azure.credentials.region') || process.env.BOTIUM_SPEECH_AZURE_REGION
Expand Down Expand Up @@ -158,6 +175,7 @@ module.exports = {
ibmSttOptions,
ibmTtsOptions,
pollyOptions,
awstranscribeOptions,
azureSpeechConfig,
applyExtraAzureSpeechConfig,
getAzureErrorDetails,
Expand Down

0 comments on commit 99c0072

Please sign in to comment.