From 99c007249d27e023ea2ff3f5f95a04f56f6d33c8 Mon Sep 17 00:00:00 2001
From: Florian Treml <florian.treml@botium.at>
Date: Fri, 21 Jan 2022 17:43:28 +0100
Subject: [PATCH] Added support for AWS Transcribe Streaming

---
 frontend/package.json             |   1 +
 frontend/src/routes.js            |   9 +-
 frontend/src/stt/awstranscribe.js | 133 ++++++++++++++++++++++++++++++
 frontend/src/swagger.json         |   9 +-
 frontend/src/utils.js             |  18 ++++
 5 files changed, 163 insertions(+), 7 deletions(-)
 create mode 100644 frontend/src/stt/awstranscribe.js

diff --git a/frontend/package.json b/frontend/package.json
index 8b455f7..791fbe4 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -14,6 +14,7 @@
   "license": "MIT",
   "dependencies": {
     "@aws-sdk/client-polly": "^3.47.1",
+    "@aws-sdk/client-transcribe-streaming": "^3.47.1",
     "@google-cloud/speech": "^4.10.0",
     "@google-cloud/storage": "^5.18.0",
     "@google-cloud/text-to-speech": "^3.4.0",
diff --git a/frontend/src/routes.js b/frontend/src/routes.js
index 98bd7ec..a2c7bda 100644
--- a/frontend/src/routes.js
+++ b/frontend/src/routes.js
@@ -46,7 +46,8 @@ const sttEngines = {
   google: new (require('./stt/google'))(),
   kaldi: new (require('./stt/kaldi'))(),
   ibm: new (require('./stt/ibm'))(),
-  azure: new (require('./stt/azure'))()
+  azure: new (require('./stt/azure'))(),
+  awstranscribe: new (require('./stt/awstranscribe'))()
 }
 
 const multerMemoryStorage = multer.memoryStorage()
@@ -124,7 +125,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm, azure]
+ *           enum: [kaldi, google, ibm, azure, awstranscribe]
  *     responses:
  *       200:
  *         description: List of supported STT languages
@@ -176,7 +177,7 @@ const router = express.Router()
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm, azure]
+ *           enum: [kaldi, google, ibm, azure, awstranscribe]
  *       - name: cache
  *         description: Use result cache (default Y)
  *         in: query
@@ -705,7 +706,7 @@ const wssStreams = {}
  *         required: false
  *         schema:
  *           type: string
- *           enum: [kaldi, google, ibm, azure]
+ *           enum: [kaldi, google, ibm, azure, awstranscribe]
  *     responses:
  *       200:
  *         description: Websocket Url to stream the audio to, and the uri to check status and end the stream
diff --git a/frontend/src/stt/awstranscribe.js b/frontend/src/stt/awstranscribe.js
new file mode 100644
index 0000000..00b0136
--- /dev/null
+++ b/frontend/src/stt/awstranscribe.js
@@ -0,0 +1,172 @@
+const _ = require('lodash')
+const { TranscribeStreamingClient, StartStreamTranscriptionCommand } = require('@aws-sdk/client-transcribe-streaming')
+const { PassThrough } = require('stream')
+const EventEmitter = require('events')
+
+const debug = require('debug')('botium-speech-processing-awstranscribe-stt')
+
+const { awstranscribeOptions } = require('../utils')
+
+const languageCodes = [
+  'af-ZA',
+  'ar-AE',
+  'ar-SA',
+  'zh-CN',
+  'zh-TW',
+  'da-DK',
+  'nl-NL',
+  'en-AU',
+  'en-GB',
+  'en-IN',
+  'en-IE',
+  'en-NZ',
+  'en-AB',
+  'en-ZA',
+  'en-US',
+  'en-WL',
+  'fr-FR',
+  'fr-CA',
+  'fa-IR',
+  'de-DE',
+  'de-CH',
+  'he-IL',
+  'hi-IN',
+  'id-ID',
+  'it-IT',
+  'ja-JP',
+  'ko-KR',
+  'ms-MY',
+  'pt-PT',
+  'pt-BR',
+  'ru-RU',
+  'es-ES',
+  'es-US',
+  'ta-IN',
+  'te-IN',
+  'th-TH',
+  'tr-TR'
+].sort()
+
+/**
+ * Speech-to-text engine backed by AWS Transcribe Streaming.
+ */
+class AwsTranscribeSTT {
+  /**
+   * Lists the language codes offered by this engine.
+   *
+   * @param {object} req - incoming request (not read here)
+   * @returns {Promise<string[]>} the sorted module-level languageCodes list
+   */
+  async languages (req) {
+    return languageCodes
+  }
+
+  /**
+   * Opens a streaming transcription session with AWS Transcribe.
+   *
+   * The request defaults to 'pcm' audio at 16000 Hz; req.body.awstranscribe.config may override
+   * any request field except AudioStream, which is always fed from write().
+   *
+   * @param {object} req - incoming request, passed to awstranscribeOptions for the client options
+   * @param {object} options
+   * @param {string} options.language - sent as LanguageCode
+   * @returns {Promise<object>} handle with
+   *   - events: EventEmitter emitting 'data' ({ text, final, start, end, debug } or { err }) and 'close'
+   *   - write(buffer): queue audio for transcription
+   *   - end(): signal that no more audio follows
+   *   - close(): abort the audio input
+   * @throws {Error} when the transcription command cannot be started
+   */
+  async stt_OpenStream (req, { language }) {
+    const transcribeClient = new TranscribeStreamingClient(awstranscribeOptions(req))
+
+    const audioInputStream = new PassThrough()
+    const audioStream = async function * () {
+      for await (const payloadChunk of audioInputStream) {
+        // Slice the buffer directly into AudioEvents of at most 25000 bytes each
+        for (let offset = 0; offset < payloadChunk.length; offset += 25000) {
+          yield { AudioEvent: { AudioChunk: Buffer.from(payloadChunk.subarray(offset, offset + 25000)) } }
+        }
+      }
+    }
+
+    const request = {
+      LanguageCode: language,
+      MediaEncoding: 'pcm',
+      MediaSampleRateHertz: 16000
+    }
+    Object.assign(request, _.get(req, 'body.awstranscribe.config'))
+    // Set last so that the caller-supplied config cannot replace the audio source
+    request.AudioStream = audioStream()
+
+    const events = new EventEmitter()
+    let cmdResponse
+    try {
+      cmdResponse = await transcribeClient.send(new StartStreamTranscriptionCommand(request))
+    } catch (err) {
+      debug(err)
+      // Nothing will consume the audio any more, so release the stream and the client
+      audioInputStream.destroy()
+      transcribeClient.destroy()
+      throw new Error(`AWS Transcribe STT streaming failed: ${err.message}`)
+    }
+
+    // Results are read on a later tick, after the handle below has been returned to the caller
+    setTimeout(async () => {
+      try {
+        for await (const streamEvent of cmdResponse.TranscriptResultStream) {
+          const results = _.get(streamEvent, 'TranscriptEvent.Transcript.Results') || []
+          for (const result of results) {
+            const alternative = _.get(result, 'Alternatives[0]')
+            // A result without alternatives has no text to report
+            if (!alternative) continue
+            events.emit('data', {
+              text: alternative.Transcript,
+              final: !result.IsPartial,
+              start: result.StartTime,
+              end: result.EndTime,
+              debug: result
+            })
+          }
+        }
+      } catch (err) {
+        debug(err)
+        events.emit('data', {
+          err: `${err.message}`
+        })
+      } finally {
+        transcribeClient.destroy()
+      }
+      events.emit('close')
+    }, 0)
+
+    return {
+      events,
+      write: (buffer) => {
+        // Audio arriving after end() or close() is dropped instead of hitting a finished stream
+        if (!audioInputStream.destroyed && !audioInputStream.writableEnded) {
+          audioInputStream.push(buffer)
+        }
+      },
+      end: () => {
+        if (!audioInputStream.destroyed) {
+          audioInputStream.end()
+        }
+      },
+      close: () => {
+        audioInputStream.destroy()
+      }
+    }
+  }
+
+  /**
+   * Non-streaming transcription entry point.
+   *
+   * NOTE(review): there is no implementation yet - the returned promise resolves to undefined.
+   */
+  async stt (req, { language, buffer, hint }) {
+
+  }
+}
+
+module.exports = AwsTranscribeSTT
diff --git a/frontend/src/swagger.json b/frontend/src/swagger.json
index 88ff99e..99df11a 100644
--- a/frontend/src/swagger.json
+++ b/frontend/src/swagger.json
@@ -59,7 +59,8 @@
                 "kaldi",
                 "google",
                 "ibm",
-                "azure"
+                "azure",
+                "awstranscribe"
               ]
             }
           }
@@ -127,7 +128,8 @@
                 "kaldi",
                 "google",
                 "ibm",
-                "azure"
+                "azure",
+                "awstranscribe"
               ]
             }
           },
@@ -605,7 +607,8 @@
                 "kaldi",
                 "google",
                 "ibm",
-                "azure"
+                "azure",
+                "awstranscribe"
               ]
             }
           }
diff --git a/frontend/src/utils.js b/frontend/src/utils.js
index 1f9583f..5324211 100644
--- a/frontend/src/utils.js
+++ b/frontend/src/utils.js
@@ -100,6 +100,23 @@ const pollyOptions = (req) => {
   throw new Error('AWS Polly credentials not found')
 }
 
+// Builds the AWS Transcribe client options. Values from the request body
+// (body.awstranscribe.credentials) take precedence over the BOTIUM_SPEECH_AWS_*
+// environment variables; throws when region, accessKeyId or secretAccessKey is missing.
+const awstranscribeOptions = (req) => {
+  const env = process.env
+  const region = _.get(req, 'body.awstranscribe.credentials.region') || env.BOTIUM_SPEECH_AWS_REGION
+  const accessKeyId = _.get(req, 'body.awstranscribe.credentials.accessKeyId') || env.BOTIUM_SPEECH_AWS_ACCESS_KEY_ID
+  const secretAccessKey = _.get(req, 'body.awstranscribe.credentials.secretAccessKey') || env.BOTIUM_SPEECH_AWS_SECRET_ACCESS_KEY
+  if (!region || !accessKeyId || !secretAccessKey) {
+    throw new Error('AWS Transcribe credentials not found')
+  }
+  return {
+    region,
+    credentials: { accessKeyId, secretAccessKey }
+  }
+}
+
 const azureSpeechConfig = (req) => {
   const subscriptionKey = _.get(req, 'body.azure.credentials.subscriptionKey') || process.env.BOTIUM_SPEECH_AZURE_SUBSCRIPTION_KEY
   const region = _.get(req, 'body.azure.credentials.region') || process.env.BOTIUM_SPEECH_AZURE_REGION
@@ -158,6 +175,7 @@ module.exports = {
   ibmSttOptions,
   ibmTtsOptions,
   pollyOptions,
+  awstranscribeOptions,
   azureSpeechConfig,
   applyExtraAzureSpeechConfig,
   getAzureErrorDetails,