From 91e5d06891f1f883644ec4dc96fcc9f7b1b07c85 Mon Sep 17 00:00:00 2001
From: Markus Ast <m@rkusa.st>
Date: Fri, 24 May 2024 16:39:22 +0200
Subject: [PATCH] Add Piper as TTS provider

---
 README.md                     | 18 +++++++++++
 lua/DCS-gRPC/grpc-mission.lua |  2 +-
 lua/Hooks/DCS-gRPC.lua        |  2 +-
 protos/dcs/srs/v0/srs.proto   | 12 ++++++++
 src/config.rs                 |  9 ++++++
 src/rpc/srs.rs                | 36 +++++++++++++++++++++-
 src/server.rs                 |  6 ++++
 tts/src/lib.rs                |  4 +++
 tts/src/piper.rs              | 57 +++++++++++++++++++++++++++++++++++
 9 files changed, 143 insertions(+), 3 deletions(-)
 create mode 100644 tts/src/piper.rs

diff --git a/README.md b/README.md
index 7b7a18f0..632ad9fa 100644
--- a/README.md
+++ b/README.md
@@ -115,6 +115,12 @@ tts.provider.gcloud.defaultVoice = "en-GB-Neural2-A"
 -- Requires at least Windows Server 2019 to work properly.
 tts.provider.win.defaultVoice = "David"
 
+-- The default Piper language model to use (must be installed manually).
+tts.provider.piper.defaultVoice = "..."
+
+-- The default Piper speech speed (1.0 is the default; lower is quicker, higher is slower).
+tts.provider.piper.defaultSpeed = 1.0
+
 -- Your SRS server's address.
 srs.addr = "127.0.0.1:5002"
 ```
@@ -167,6 +173,17 @@ You can also check for the present of a `\Logs\grpc.log` file.
 
 The server will be running on port 50051 by default.
 
+## Install Piper TTS
+
+This is only necessary if you plan to use Piper as your TTS provider.
+
+1. Download `piper_windows_amd64.zip` from the latest [Piper release](https://github.com/rhasspy/piper/releases).
+2. Extract the `piper` directory from this zip file and place it at `DCS.openbeta\Mods\tech\DCS-gRPC\piper`.
+3. Download at least one voice from [Piper Voices](https://github.com/rhasspy/piper/blob/master/VOICES.md). You need both the `model` and the `config`. For the SRS voice quality, a `low` model is sufficient.
+4. Place the model and config into your `DCS.openbeta\Mods\tech\DCS-gRPC\piper\` directory (e.g. `DCS.openbeta\Mods\tech\DCS-gRPC\piper\en_US-amy-low.onnx` and `DCS.openbeta\Mods\tech\DCS-gRPC\piper\en_US-amy-low.onnx.json`).
+5. Set one of your installed voices as the default voice in your config (`tts.provider.piper.defaultVoice = "..."`, e.g. `tts.provider.piper.defaultVoice = "en_US-amy-low.onnx"`).
+6. If you want to use Piper, don't forget to set it as your default provider, or enable it on a per-transmission basis.
+
 ## Lua API
 
 `DCS-gRPC` provides the following Lua APIs to interact with the server from within Lua.
@@ -202,6 +219,7 @@ The server will be running on port 50051 by default.
         -- `= { azure = {} }` / `= { azure = { voice = "..." } }` enable Azure TTS
         -- `= { gcloud = {} }` / `= { gcloud = { voice = "..." } }` enable Google Cloud TTS
         -- `= { win = {} }` / `= { win = { voice = "..." } }` enable Windows TTS
+        -- `= { piper = {} }` / `= { piper = { voice = "...", speed = 1.0 } }` enable Piper TTS
         provider = null,
     }
     ```
diff --git a/lua/DCS-gRPC/grpc-mission.lua b/lua/DCS-gRPC/grpc-mission.lua
index da41487e..90da4159 100644
--- a/lua/DCS-gRPC/grpc-mission.lua
+++ b/lua/DCS-gRPC/grpc-mission.lua
@@ -1,7 +1,7 @@
 if not GRPC then
   GRPC = {
     -- scaffold nested tables to allow direct assignment in config file
-    tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {} } },
+    tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {}, piper = {} } },
     srs = {},
   }
 end
diff --git a/lua/Hooks/DCS-gRPC.lua b/lua/Hooks/DCS-gRPC.lua
index c14e8a97..550f84d7 100644
--- a/lua/Hooks/DCS-gRPC.lua
+++ b/lua/Hooks/DCS-gRPC.lua
@@ -7,7 +7,7 @@ local function init()
   if not GRPC then
     _G.GRPC = {
       -- scaffold nested tables to allow direct assignment in config file
-      tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {} } },
+      tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {}, piper = {} } },
       srs = {},
     }
   end
diff --git a/protos/dcs/srs/v0/srs.proto b/protos/dcs/srs/v0/srs.proto
index c813e7b0..093c6fd6 100644
--- a/protos/dcs/srs/v0/srs.proto
+++ b/protos/dcs/srs/v0/srs.proto
@@ -75,6 +75,16 @@ message TransmitRequest {
     optional string voice = 1;
   }
 
+  message Piper {
+    // The voice model the text is synthesized in (corresponds to a model placed in your
+    // `DCS.openbeta\Mods\tech\DCS-gRPC\piper\` directory).
+    optional string voice = 1;
+
+    // The speed of the generated speech; 1.0 is the default; lower is quicker, higher is slower.
+    optional float speed = 2;
+  }
+
+
   // Optional TTS provider to be use. Defaults to the one configured in your
   // config or to Windows' built-in TTS.
   oneof provider {
@@ -82,6 +92,8 @@ message TransmitRequest {
     Azure azure = 9;
     GCloud gcloud = 10;
     Windows win = 11;
+    // Piper does not support SSML; only use it with plain text.
+    Piper piper = 12;
   }
 }
 
diff --git a/src/config.rs b/src/config.rs
index f626090b..027d3860 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -38,6 +38,7 @@ pub struct TtsProviderConfig {
     pub azure: Option<AzureConfig>,
     pub gcloud: Option<GCloudConfig>,
     pub win: Option<WinConfig>,
+    pub piper: Option<PiperConfig>,
 }
 
 #[derive(Debug, Clone, Default, Deserialize, Serialize)]
@@ -48,6 +49,7 @@ pub enum TtsProvider {
     GCloud,
     #[default]
     Win,
+    Piper,
 }
 
 #[derive(Clone, Deserialize, Serialize)]
@@ -80,6 +82,13 @@ pub struct WinConfig {
     pub default_voice: Option<String>,
 }
 
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct PiperConfig {
+    pub default_voice: Option<String>,
+    pub default_speed: Option<f32>,
+}
+
 #[derive(Debug, Clone, Default, Deserialize, Serialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SrsConfig {
diff --git a/src/rpc/srs.rs b/src/rpc/srs.rs
index cdd0782a..d746ca57 100644
--- a/src/rpc/srs.rs
+++ b/src/rpc/srs.rs
@@ -1,13 +1,14 @@
 use std::error;
 use std::future::Future;
 use std::net::{IpAddr, Ipv4Addr, SocketAddr};
+use std::path::PathBuf;
 use std::str::FromStr;
 use std::time::{Duration, Instant};
 
 use ::srs::Sender;
 #[cfg(target_os = "windows")]
 use ::tts::WinConfig;
-use ::tts::{AwsConfig, AwsRegion, AzureConfig, GCloudConfig, TtsConfig};
+use ::tts::{AwsConfig, AwsRegion, AzureConfig, GCloudConfig, PiperConfig, TtsConfig};
 use futures_util::FutureExt;
 use stubs::common::v0::{Coalition, Unit};
 use stubs::mission::v0::stream_events_response::{Event, TtsEvent};
@@ -27,6 +28,7 @@ use crate::srs::SrsClients;
 pub struct Srs {
     tts_config: crate::config::TtsConfig,
     srs_config: crate::config::SrsConfig,
+    write_dir: PathBuf,
     rpc: MissionRpc,
     srs_clients: SrsClients,
     shutdown_signal: ShutdownHandle,
@@ -36,6 +38,7 @@ impl Srs {
     pub fn new(
         tts_config: crate::config::TtsConfig,
         srs_config: crate::config::SrsConfig,
+        write_dir: PathBuf,
         rpc: MissionRpc,
         srs_clients: SrsClients,
         shutdown_signal: ShutdownHandle,
@@ -43,6 +46,7 @@ impl Srs {
         Self {
             tts_config,
             srs_config,
+            write_dir,
             rpc,
             srs_clients,
             shutdown_signal,
@@ -105,6 +109,10 @@ impl SrsService for Srs {
                 TtsProvider::Win => {
                     transmit_request::Provider::Win(transmit_request::Windows { voice: None })
                 }
+                TtsProvider::Piper => transmit_request::Provider::Piper(transmit_request::Piper {
+                    voice: None,
+                    speed: None,
+                }),
             }) {
             transmit_request::Provider::Aws(transmit_request::Aws { voice }) => {
                 TtsConfig::Aws(AwsConfig {
@@ -215,6 +223,32 @@ impl SrsService for Srs {
                     "Windows TTS is only available on Windows",
                 ));
             }
+            transmit_request::Provider::Piper(transmit_request::Piper { voice, speed }) => {
+                TtsConfig::Piper(PiperConfig {
+                    voice: voice
+                        .or_else(|| {
+                            self.tts_config
+                                .provider
+                                .as_ref()
+                                .and_then(|p| p.piper.as_ref())
+                                .and_then(|p| p.default_voice.clone())
+                        })
+                        .filter(|v| !v.is_empty())
+                        .ok_or_else(|| {
+                            Status::failed_precondition("tts.provider.piper.default_voice not set")
+                        })?,
+                    speed: speed
+                        .or_else(|| {
+                            self.tts_config
+                                .provider
+                                .as_ref()
+                                .and_then(|p| p.piper.as_ref())
+                                .and_then(|p| p.default_speed)
+                        })
+                        .unwrap_or(1.0),
+                    piper_path: self.write_dir.join("Mods/tech/DCS-gRPC/piper"),
+                })
+            }
         };
 
         let frames = ::tts::synthesize(&request.ssml, &config)
diff --git a/src/server.rs b/src/server.rs
index c70b8acf..69ce0ed0 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -1,5 +1,6 @@
 use std::future::Future;
 use std::net::SocketAddr;
+use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::Duration;
 
@@ -49,6 +50,7 @@ struct ServerState {
     stats: Stats,
     tts_config: TtsConfig,
     srs_config: SrsConfig,
+    write_dir: PathBuf,
     srs_transmit: Arc<Mutex<mpsc::Receiver<TransmitRequest>>>,
 }
 
@@ -70,6 +72,7 @@ impl Server {
                 stats: Stats::new(shutdown.handle()),
                 tts_config: config.tts.clone().unwrap_or_default(),
                 srs_config: config.srs.clone().unwrap_or_default(),
+                write_dir: PathBuf::from(&config.write_dir),
                 srs_transmit: Arc::new(Mutex::new(rx)),
             },
             srs_transmit: tx,
@@ -202,6 +205,7 @@ async fn try_run(
         stats,
         tts_config,
         srs_config,
+        write_dir,
         srs_transmit,
     } = state;
 
@@ -225,6 +229,7 @@ async fn try_run(
     let srs = Srs::new(
         tts_config.clone(),
         srs_config.clone(),
+        write_dir.clone(),
         mission_rpc.clone(),
         srs_clients.clone(),
         shutdown_signal.clone(),
@@ -256,6 +261,7 @@ async fn try_run(
         .add_service(SrsServiceServer::new(Srs::new(
             tts_config,
             srs_config,
+            write_dir,
             mission_rpc.clone(),
             srs_clients,
             shutdown_signal.clone(),
diff --git a/tts/src/lib.rs b/tts/src/lib.rs
index 80aad1cc..af04b1b1 100644
--- a/tts/src/lib.rs
+++ b/tts/src/lib.rs
@@ -3,12 +3,14 @@ use std::error;
 pub use aws::{AwsConfig, Region as AwsRegion};
 pub use azure::AzureConfig;
 pub use gcloud::GCloudConfig;
+pub use piper::PiperConfig;
 #[cfg(target_os = "windows")]
 pub use win::WinConfig;
 
 mod aws;
 mod azure;
 mod gcloud;
+mod piper;
 #[cfg(target_os = "windows")]
 mod win;
 
@@ -19,6 +21,7 @@ pub enum TtsConfig {
     GCloud(gcloud::GCloudConfig),
     #[cfg(target_os = "windows")]
     Win(win::WinConfig),
+    Piper(piper::PiperConfig),
 }
 
 /// Synthesize the `text` to speech. Returns a vec of opus frames.
@@ -32,6 +35,7 @@ pub async fn synthesize(
         TtsConfig::GCloud(config) => gcloud::synthesize(text, config).await?,
         #[cfg(target_os = "windows")]
         TtsConfig::Win(config) => win::synthesize(text, config).await?,
+        TtsConfig::Piper(config) => piper::synthesize(text, config).await?,
     })
 }
 
diff --git a/tts/src/piper.rs b/tts/src/piper.rs
new file mode 100644
index 00000000..2d716190
--- /dev/null
+++ b/tts/src/piper.rs
@@ -0,0 +1,77 @@
+use std::path::PathBuf;
+use std::process::Stdio;
+
+use tokio::io::AsyncWriteExt;
+use tokio::process::Command;
+
+/// Settings for a single Piper synthesis run.
+#[derive(Debug)]
+pub struct PiperConfig {
+    /// Voice model handed to Piper via its `--model` argument.
+    pub voice: String,
+    /// Value handed to Piper via its `--length_scale` argument.
+    pub speed: f32,
+    /// Directory containing `piper.exe`; also used as Piper's working directory.
+    pub piper_path: PathBuf,
+}
+
+/// Synthesizes `text` by piping it through `piper.exe` located in `config.piper_path`.
+///
+/// The text is written to Piper's stdin and everything Piper writes to stdout is converted to
+/// opus frames via `crate::wav_to_opus`.
+///
+/// # Errors
+///
+/// Fails if Piper cannot be spawned or communicated with, if it exits unsuccessfully (the error
+/// then carries Piper's stderr output, if any), or if the conversion to opus fails.
+pub async fn synthesize(text: &str, config: &PiperConfig) -> Result<Vec<Vec<u8>>, std::io::Error> {
+    let mut command = Command::new(config.piper_path.join("piper.exe"));
+    command
+        .arg("--model")
+        .arg(&config.voice)
+        .arg("--length_scale")
+        .arg(config.speed.to_string())
+        .arg("--output-raw")
+        .current_dir(&config.piper_path)
+        .stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        // Do not leave a Piper process behind when this future is dropped mid-synthesis.
+        .kill_on_drop(true);
+
+    #[cfg(target_os = "windows")]
+    {
+        // Keep Windows from opening a console window for the child process.
+        const CREATE_NO_WINDOW: u32 = 0x08000000;
+        command.creation_flags(CREATE_NO_WINDOW);
+    }
+
+    let mut child = command.spawn()?;
+
+    // Own the stdin handle so that it is closed, signalling end of input, once the text has
+    // been written. A missing handle is reported as an error rather than a panic.
+    let mut stdin = child.stdin.take().ok_or_else(|| {
+        std::io::Error::new(std::io::ErrorKind::Other, "failed to open stdin of piper")
+    })?;
+    stdin.write_all(text.as_bytes()).await?;
+    drop(stdin);
+
+    let output = child.wait_with_output().await?;
+
+    if !output.status.success() {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::Other,
+            if output.stderr.is_empty() {
+                "failed to execute piper (maybe voice model not found)".into()
+            } else {
+                String::from_utf8_lossy(&output.stderr)
+            },
+        ));
+    }
+
+    // NOTE(review): Piper is started with `--output-raw`; confirm that `wav_to_opus` accepts
+    // that output format (its implementation is not visible from this file).
+    crate::wav_to_opus(output.stdout.into())
+        .await
+        .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))
+}