From 91e5d06891f1f883644ec4dc96fcc9f7b1b07c85 Mon Sep 17 00:00:00 2001 From: Markus Ast Date: Fri, 24 May 2024 16:39:22 +0200 Subject: [PATCH] Add Piper as TTS provider --- README.md | 18 +++++++++++ lua/DCS-gRPC/grpc-mission.lua | 2 +- lua/Hooks/DCS-gRPC.lua | 2 +- protos/dcs/srs/v0/srs.proto | 12 ++++++++ src/config.rs | 9 ++++++ src/rpc/srs.rs | 36 +++++++++++++++++++++- src/server.rs | 6 ++++ tts/src/lib.rs | 4 +++ tts/src/piper.rs | 57 +++++++++++++++++++++++++++++++++++ 9 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 tts/src/piper.rs diff --git a/README.md b/README.md index 7b7a18f0..632ad9fa 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,12 @@ tts.provider.gcloud.defaultVoice = "en-GB-Neural2-A" -- Requires at least Windows Server 2019 to work properly. tts.provider.win.defaultVoice = "David" +-- The default Piper language model to use (must be installed manually). +tts.provider.piper.defaultVoice = "..." + +-- The default Piper speech speed (1.0 is the default; lower is quicker, higher is slower). +tts.provider.piper.defaultSpeed = 1.0 + -- Your SRS server's address. srs.addr = "127.0.0.1:5002" ``` @@ -167,6 +173,17 @@ You can also check for the present of a `\Logs\grpc.log` file. The server will be running on port 50051 by default. +## Install Piper TTS + +This is only necessary if you plan to use Piper as your TTS provider. + +1. Download `piper_windows_amd64.zip` from the latest [Piper release](https://github.com/rhasspy/piper/releases). +2. Extract the `piper` directory from this zip file and place it at `DCS.openbeta\Mods\tech\DCS-gRPC\piper`. +3. Download at least one voice from [Piper Voices](https://github.com/rhasspy/piper/blob/master/VOICES.md). You need both the `model` and the `config`. For the SRS voice quality, a `low` model is sufficient. +4. Place the model and config into your `DCS.openbeta\Mods\tech\DCS-gRPC\piper\` directory (e.g. 
`DCS.openbeta\Mods\tech\DCS-gRPC\piper\en_US-amy-low.onnx` and `DCS.openbeta\Mods\tech\DCS-gRPC\piper\en_US-amy-low.onnx.json`). +5. Set one of your installed voices as the default voice in your config (`tts.provider.piper.defaultVoice = "..."`, e.g. `tts.provider.piper.defaultVoice = "en_US-amy-low.onnx"`). +6. If you want to use Piper, don't forget to set it as your default provider, or enable it on a per-transmission basis. + ## Lua API `DCS-gRPC` provides the following Lua APIs to interact with the server from within Lua. @@ -202,6 +219,7 @@ The server will be running on port 50051 by default. -- `= { azure = {} }` / `= { azure = { voice = "..." } }` enable Azure TTS -- `= { gcloud = {} }` / `= { gcloud = { voice = "..." } }` enable Google Cloud TTS -- `= { win = {} }` / `= { win = { voice = "..." } }` enable Windows TTS + -- `= { piper = {} }` / `= { piper = { voice = "...", speed = 1.0 } }` enable Piper TTS provider = null, } ``` diff --git a/lua/DCS-gRPC/grpc-mission.lua b/lua/DCS-gRPC/grpc-mission.lua index da41487e..90da4159 100644 --- a/lua/DCS-gRPC/grpc-mission.lua +++ b/lua/DCS-gRPC/grpc-mission.lua @@ -1,7 +1,7 @@ if not GRPC then GRPC = { -- scaffold nested tables to allow direct assignment in config file - tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {} } }, + tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {}, piper = {} } }, srs = {}, } end diff --git a/lua/Hooks/DCS-gRPC.lua b/lua/Hooks/DCS-gRPC.lua index c14e8a97..550f84d7 100644 --- a/lua/Hooks/DCS-gRPC.lua +++ b/lua/Hooks/DCS-gRPC.lua @@ -7,7 +7,7 @@ local function init() if not GRPC then _G.GRPC = { -- scaffold nested tables to allow direct assignment in config file - tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {} } }, + tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {}, piper = {} } }, srs = {}, } end diff --git a/protos/dcs/srs/v0/srs.proto b/protos/dcs/srs/v0/srs.proto index c813e7b0..093c6fd6 100644 --- 
a/protos/dcs/srs/v0/srs.proto +++ b/protos/dcs/srs/v0/srs.proto @@ -75,6 +75,16 @@ message TransmitRequest { optional string voice = 1; } + message Piper { + // The voice model the text is synthesized in (corresponds to a model placed in your + // `DCS.openbeta\Mods\tech\DCS-gRPC\piper\` directory). + optional string voice = 1; + + // The speed of the generated speech; 1.0 is the default; lower is quicker, higher is slower. + optional float speed = 2; + } + + // Optional TTS provider to be use. Defaults to the one configured in your // config or to Windows' built-in TTS. oneof provider { @@ -82,6 +92,8 @@ message TransmitRequest { Azure azure = 9; GCloud gcloud = 10; Windows win = 11; + // Piper does not support SSML, only use it with plain text. + Piper piper = 12; } } diff --git a/src/config.rs b/src/config.rs index f626090b..027d3860 100644 --- a/src/config.rs +++ b/src/config.rs @@ -38,6 +38,7 @@ pub struct TtsProviderConfig { pub azure: Option, pub gcloud: Option, pub win: Option, + pub piper: Option, } #[derive(Debug, Clone, Default, Deserialize, Serialize)] @@ -48,6 +49,7 @@ pub enum TtsProvider { GCloud, #[default] Win, + Piper, } #[derive(Clone, Deserialize, Serialize)] @@ -80,6 +82,13 @@ pub struct WinConfig { pub default_voice: Option, } +#[derive(Debug, Clone, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct PiperConfig { + pub default_voice: Option, + pub default_speed: Option, +} + #[derive(Debug, Clone, Default, Deserialize, Serialize)] #[serde(rename_all = "camelCase")] pub struct SrsConfig { diff --git a/src/rpc/srs.rs b/src/rpc/srs.rs index cdd0782a..d746ca57 100644 --- a/src/rpc/srs.rs +++ b/src/rpc/srs.rs @@ -1,13 +1,14 @@ use std::error; use std::future::Future; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; +use std::path::PathBuf; use std::str::FromStr; use std::time::{Duration, Instant}; use ::srs::Sender; #[cfg(target_os = "windows")] use ::tts::WinConfig; -use ::tts::{AwsConfig, AwsRegion, AzureConfig, GCloudConfig, 
TtsConfig}; +use ::tts::{AwsConfig, AwsRegion, AzureConfig, GCloudConfig, PiperConfig, TtsConfig}; use futures_util::FutureExt; use stubs::common::v0::{Coalition, Unit}; use stubs::mission::v0::stream_events_response::{Event, TtsEvent}; @@ -27,6 +28,7 @@ use crate::srs::SrsClients; pub struct Srs { tts_config: crate::config::TtsConfig, srs_config: crate::config::SrsConfig, + write_dir: PathBuf, rpc: MissionRpc, srs_clients: SrsClients, shutdown_signal: ShutdownHandle, @@ -36,6 +38,7 @@ impl Srs { pub fn new( tts_config: crate::config::TtsConfig, srs_config: crate::config::SrsConfig, + write_dir: PathBuf, rpc: MissionRpc, srs_clients: SrsClients, shutdown_signal: ShutdownHandle, @@ -43,6 +46,7 @@ impl Srs { Self { tts_config, srs_config, + write_dir, rpc, srs_clients, shutdown_signal, @@ -105,6 +109,10 @@ impl SrsService for Srs { TtsProvider::Win => { transmit_request::Provider::Win(transmit_request::Windows { voice: None }) } + TtsProvider::Piper => transmit_request::Provider::Piper(transmit_request::Piper { + voice: None, + speed: None, + }), }) { transmit_request::Provider::Aws(transmit_request::Aws { voice }) => { TtsConfig::Aws(AwsConfig { @@ -215,6 +223,32 @@ impl SrsService for Srs { "Windows TTS is only available on Windows", )); } + transmit_request::Provider::Piper(transmit_request::Piper { voice, speed }) => { + TtsConfig::Piper(PiperConfig { + voice: voice + .or_else(|| { + self.tts_config + .provider + .as_ref() + .and_then(|p| p.piper.as_ref()) + .and_then(|p| p.default_voice.clone()) + }) + .filter(|v| !v.is_empty()) + .ok_or_else(|| { + Status::failed_precondition("tts.provider.piper.default_voice not set") + })?, + speed: speed + .or_else(|| { + self.tts_config + .provider + .as_ref() + .and_then(|p| p.piper.as_ref()) + .and_then(|p| p.default_speed) + }) + .unwrap_or(1.0), + piper_path: self.write_dir.join("Mods/tech/DCS-gRPC/piper"), + }) + } }; let frames = ::tts::synthesize(&request.ssml, &config) diff --git a/src/server.rs b/src/server.rs 
index c70b8acf..69ce0ed0 100644 --- a/src/server.rs +++ b/src/server.rs @@ -1,5 +1,6 @@ use std::future::Future; use std::net::SocketAddr; +use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; @@ -49,6 +50,7 @@ struct ServerState { stats: Stats, tts_config: TtsConfig, srs_config: SrsConfig, + write_dir: PathBuf, srs_transmit: Arc>>, } @@ -70,6 +72,7 @@ impl Server { stats: Stats::new(shutdown.handle()), tts_config: config.tts.clone().unwrap_or_default(), srs_config: config.srs.clone().unwrap_or_default(), + write_dir: PathBuf::from(&config.write_dir), srs_transmit: Arc::new(Mutex::new(rx)), }, srs_transmit: tx, @@ -202,6 +205,7 @@ async fn try_run( stats, tts_config, srs_config, + write_dir, srs_transmit, } = state; @@ -225,6 +229,7 @@ async fn try_run( let srs = Srs::new( tts_config.clone(), srs_config.clone(), + write_dir.clone(), mission_rpc.clone(), srs_clients.clone(), shutdown_signal.clone(), @@ -256,6 +261,7 @@ async fn try_run( .add_service(SrsServiceServer::new(Srs::new( tts_config, srs_config, + write_dir, mission_rpc.clone(), srs_clients, shutdown_signal.clone(), diff --git a/tts/src/lib.rs b/tts/src/lib.rs index 80aad1cc..af04b1b1 100644 --- a/tts/src/lib.rs +++ b/tts/src/lib.rs @@ -3,12 +3,14 @@ use std::error; pub use aws::{AwsConfig, Region as AwsRegion}; pub use azure::AzureConfig; pub use gcloud::GCloudConfig; +pub use piper::PiperConfig; #[cfg(target_os = "windows")] pub use win::WinConfig; mod aws; mod azure; mod gcloud; +mod piper; #[cfg(target_os = "windows")] mod win; @@ -19,6 +21,7 @@ pub enum TtsConfig { GCloud(gcloud::GCloudConfig), #[cfg(target_os = "windows")] Win(win::WinConfig), + Piper(piper::PiperConfig), } /// Synthesize the `text` to speech. Returns a vec of opus frames. 
@@ -32,6 +35,7 @@ pub async fn synthesize( TtsConfig::GCloud(config) => gcloud::synthesize(text, config).await?, #[cfg(target_os = "windows")] TtsConfig::Win(config) => win::synthesize(text, config).await?, + TtsConfig::Piper(config) => piper::synthesize(text, config).await?, }) } diff --git a/tts/src/piper.rs b/tts/src/piper.rs new file mode 100644 index 00000000..2d716190 --- /dev/null +++ b/tts/src/piper.rs @@ -0,0 +1,57 @@ +use std::path::PathBuf; +use std::process::Stdio; + +use tokio::io::AsyncWriteExt; +use tokio::process::Command; + +#[derive(Debug)] +pub struct PiperConfig { + pub voice: String, + pub speed: f32, + pub piper_path: PathBuf, +} + +pub async fn synthesize(text: &str, config: &PiperConfig) -> Result>, std::io::Error> { + let mut command = Command::new(config.piper_path.join("piper.exe")); + command + .arg("--model") + .arg(&config.voice) + .arg("--length_scale") + .arg(format!("{}", config.speed)) + .arg("--output-raw") + .current_dir(&config.piper_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + #[cfg(target_os = "windows")] + { + const CREATE_NO_WINDOW: u32 = 0x08000000; + command.creation_flags(CREATE_NO_WINDOW); + } + + let mut child = command.spawn()?; + + child + .stdin + .as_mut() + .unwrap() + .write_all(text.as_bytes()) + .await?; + let output = child.wait_with_output().await?; + + if !output.status.success() { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + if output.stderr.is_empty() { + "failed to execute piper (maybe voice model not found)".into() + } else { + String::from_utf8_lossy(&output.stderr) + }, + )); + } + + crate::wav_to_opus(output.stdout.into()) + .await + .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err)) +}