Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Piper as TTS provider #261

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,12 @@ tts.provider.gcloud.defaultVoice = "en-GB-Neural2-A"
-- Requires at least Windows Server 2019 to work properly.
tts.provider.win.defaultVoice = "David"

-- The default Piper language model to use (must be installed manually).
tts.provider.piper.defaultVoice = "..."

-- The default Piper speech speed (1.0 is the default; lower is quicker, higher is slower).
tts.provider.piper.defaultSpeed = 1.0

-- Your SRS server's address.
srs.addr = "127.0.0.1:5002"
```
Expand Down Expand Up @@ -167,6 +173,17 @@ You can also check for the present of a `\Logs\grpc.log` file.

The server will be running on port 50051 by default.

## Install Piper TTS

This is only necessary if you plan to use Piper as your TTS provider.

1. Download `piper_windows_amd64.zip` from the latest [Piper release](https://github.com/rhasspy/piper/releases).
2. Extract the `piper` directory from this zip file and place it at `DCS.openbeta\Mods\tech\DCS-gRPC\piper`.
3. Download at least one voice from [Piper Voices](https://github.com/rhasspy/piper/blob/master/VOICES.md). You need both the `model` and the `config`. For the SRS voice quality, a `low` model is sufficient.
4. Place the model and config into your `DCS.openbeta\Mods\tech\DCS-gRPC\piper\` directory (e.g. `DCS.openbeta\Mods\tech\DCS-gRPC\piper\en_US-amy-low.onnx` and `DCS.openbeta\Mods\tech\DCS-gRPC\piper\en_US-amy-low.onnx.json`).
5. Set one of your installed voices as the default voice in your config (`tts.provider.piper.defaultVoice = "..."`, e.g. `tts.provider.piper.defaultVoice = "en_US-amy-low.onnx"`).
6. If you want to use Piper, don't forget to set it as your default provider, or enable it on a per-transmission basis.

## Lua API

`DCS-gRPC` provides the following Lua APIs to interact with the server from within Lua.
Expand Down Expand Up @@ -202,6 +219,7 @@ The server will be running on port 50051 by default.
-- `= { azure = {} }` / `= { azure = { voice = "..." } }` enable Azure TTS
-- `= { gcloud = {} }` / `= { gcloud = { voice = "..." } }` enable Google Cloud TTS
-- `= { win = {} }` / `= { win = { voice = "..." } }` enable Windows TTS
-- `= { piper = {} }` / `= { piper = { voice = "...", speed = 1.0 } }` enable Piper TTS
provider = null,
}
```
Expand Down
2 changes: 1 addition & 1 deletion lua/DCS-gRPC/grpc-mission.lua
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
if not GRPC then
GRPC = {
-- scaffold nested tables to allow direct assignment in config file
tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {} } },
tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {}, piper = {} } },
srs = {},
}
end
Expand Down
2 changes: 1 addition & 1 deletion lua/Hooks/DCS-gRPC.lua
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ local function init()
if not GRPC then
_G.GRPC = {
-- scaffold nested tables to allow direct assignment in config file
tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {} } },
tts = { provider = { gcloud = {}, aws = {}, azure = {}, win = {}, piper = {} } },
srs = {},
}
end
Expand Down
12 changes: 12 additions & 0 deletions protos/dcs/srs/v0/srs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,25 @@ message TransmitRequest {
optional string voice = 1;
}

message Piper {
// The voice model the text is synthesized in (corresponds to a model placed in your
// `DCS.openbeta\Mods\tech\DCS-gRPC\piper\` directory).
optional string voice = 1;

// The speed of the generated speech; 1.0 is the default; lower is quicker, higher is slower.
optional float speed = 2;
}


// Optional TTS provider to be used. Defaults to the one configured in your
// config or to Windows' built-in TTS.
oneof provider {
Aws aws = 8;
Azure azure = 9;
GCloud gcloud = 10;
Windows win = 11;
// Piper does not support SSML, only use it with plain text.
Piper piper = 12;
}
}

Expand Down
9 changes: 9 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ pub struct TtsProviderConfig {
pub azure: Option<AzureConfig>,
pub gcloud: Option<GCloudConfig>,
pub win: Option<WinConfig>,
pub piper: Option<PiperConfig>,
}

#[derive(Debug, Clone, Default, Deserialize, Serialize)]
Expand All @@ -48,6 +49,7 @@ pub enum TtsProvider {
GCloud,
#[default]
Win,
Piper,
}

#[derive(Clone, Deserialize, Serialize)]
Expand Down Expand Up @@ -80,6 +82,13 @@ pub struct WinConfig {
pub default_voice: Option<String>,
}

/// User-facing configuration of the Piper TTS provider.
///
/// Field names are (de)serialized in camelCase, i.e. as `defaultVoice` and `defaultSpeed`.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct PiperConfig {
/// Voice model to use when a transmit request does not name one itself.
pub default_voice: Option<String>,
/// Speech speed to use when a transmit request does not specify one.
pub default_speed: Option<f32>,
}

#[derive(Debug, Clone, Default, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct SrsConfig {
Expand Down
36 changes: 35 additions & 1 deletion src/rpc/srs.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
use std::error;
use std::future::Future;
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use std::path::PathBuf;
use std::str::FromStr;
use std::time::{Duration, Instant};

use ::srs::Sender;
#[cfg(target_os = "windows")]
use ::tts::WinConfig;
use ::tts::{AwsConfig, AwsRegion, AzureConfig, GCloudConfig, TtsConfig};
use ::tts::{AwsConfig, AwsRegion, AzureConfig, GCloudConfig, PiperConfig, TtsConfig};
use futures_util::FutureExt;
use stubs::common::v0::{Coalition, Unit};
use stubs::mission::v0::stream_events_response::{Event, TtsEvent};
Expand All @@ -27,6 +28,7 @@ use crate::srs::SrsClients;
pub struct Srs {
tts_config: crate::config::TtsConfig,
srs_config: crate::config::SrsConfig,
write_dir: PathBuf,
rpc: MissionRpc,
srs_clients: SrsClients,
shutdown_signal: ShutdownHandle,
Expand All @@ -36,13 +38,15 @@ impl Srs {
pub fn new(
tts_config: crate::config::TtsConfig,
srs_config: crate::config::SrsConfig,
write_dir: PathBuf,
rpc: MissionRpc,
srs_clients: SrsClients,
shutdown_signal: ShutdownHandle,
) -> Self {
Self {
tts_config,
srs_config,
write_dir,
rpc,
srs_clients,
shutdown_signal,
Expand Down Expand Up @@ -105,6 +109,10 @@ impl SrsService for Srs {
TtsProvider::Win => {
transmit_request::Provider::Win(transmit_request::Windows { voice: None })
}
TtsProvider::Piper => transmit_request::Provider::Piper(transmit_request::Piper {
voice: None,
speed: None,
}),
}) {
transmit_request::Provider::Aws(transmit_request::Aws { voice }) => {
TtsConfig::Aws(AwsConfig {
Expand Down Expand Up @@ -215,6 +223,32 @@ impl SrsService for Srs {
"Windows TTS is only available on Windows",
));
}
transmit_request::Provider::Piper(transmit_request::Piper { voice, speed }) => {
TtsConfig::Piper(PiperConfig {
voice: voice
.or_else(|| {
self.tts_config
.provider
.as_ref()
.and_then(|p| p.piper.as_ref())
.and_then(|p| p.default_voice.clone())
})
.filter(|v| !v.is_empty())
.ok_or_else(|| {
Status::failed_precondition("tts.provider.piper.default_voice not set")
})?,
speed: speed
.or_else(|| {
self.tts_config
.provider
.as_ref()
.and_then(|p| p.piper.as_ref())
.and_then(|p| p.default_speed)
})
.unwrap_or(1.0),
piper_path: self.write_dir.join("Mods/tech/DCS-gRPC/piper"),
})
}
};

let frames = ::tts::synthesize(&request.ssml, &config)
Expand Down
6 changes: 6 additions & 0 deletions src/server.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::future::Future;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;

Expand Down Expand Up @@ -49,6 +50,7 @@ struct ServerState {
stats: Stats,
tts_config: TtsConfig,
srs_config: SrsConfig,
write_dir: PathBuf,
srs_transmit: Arc<Mutex<mpsc::Receiver<TransmitRequest>>>,
}

Expand All @@ -70,6 +72,7 @@ impl Server {
stats: Stats::new(shutdown.handle()),
tts_config: config.tts.clone().unwrap_or_default(),
srs_config: config.srs.clone().unwrap_or_default(),
write_dir: PathBuf::from(&config.write_dir),
srs_transmit: Arc::new(Mutex::new(rx)),
},
srs_transmit: tx,
Expand Down Expand Up @@ -202,6 +205,7 @@ async fn try_run(
stats,
tts_config,
srs_config,
write_dir,
srs_transmit,
} = state;

Expand All @@ -225,6 +229,7 @@ async fn try_run(
let srs = Srs::new(
tts_config.clone(),
srs_config.clone(),
write_dir.clone(),
mission_rpc.clone(),
srs_clients.clone(),
shutdown_signal.clone(),
Expand Down Expand Up @@ -256,6 +261,7 @@ async fn try_run(
.add_service(SrsServiceServer::new(Srs::new(
tts_config,
srs_config,
write_dir,
mission_rpc.clone(),
srs_clients,
shutdown_signal.clone(),
Expand Down
4 changes: 4 additions & 0 deletions tts/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@ use std::error;
pub use aws::{AwsConfig, Region as AwsRegion};
pub use azure::AzureConfig;
pub use gcloud::GCloudConfig;
pub use piper::PiperConfig;
#[cfg(target_os = "windows")]
pub use win::WinConfig;

mod aws;
mod azure;
mod gcloud;
mod piper;
#[cfg(target_os = "windows")]
mod win;

Expand All @@ -19,6 +21,7 @@ pub enum TtsConfig {
GCloud(gcloud::GCloudConfig),
#[cfg(target_os = "windows")]
Win(win::WinConfig),
Piper(piper::PiperConfig),
}

/// Synthesize the `text` to speech. Returns a vec of opus frames.
Expand All @@ -32,6 +35,7 @@ pub async fn synthesize(
TtsConfig::GCloud(config) => gcloud::synthesize(text, config).await?,
#[cfg(target_os = "windows")]
TtsConfig::Win(config) => win::synthesize(text, config).await?,
TtsConfig::Piper(config) => piper::synthesize(text, config).await?,
})
}

Expand Down
57 changes: 57 additions & 0 deletions tts/src/piper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use std::path::PathBuf;
use std::process::Stdio;

use tokio::io::AsyncWriteExt;
use tokio::process::Command;

/// Fully resolved settings for a single Piper synthesis run.
#[derive(Debug)]
pub struct PiperConfig {
/// Voice model; passed to Piper as the value of its `--model` argument.
pub voice: String,
/// Speech speed; passed to Piper as the value of its `--length_scale` argument.
pub speed: f32,
/// Directory that contains the Piper executable; also used as Piper's working directory.
pub piper_path: PathBuf,
}

/// Synthesizes `text` to speech by running the Piper executable located in `config.piper_path`.
///
/// The text is written to Piper's stdin. Whatever Piper writes to its stdout is handed to
/// `crate::wav_to_opus`, whose result is returned.
///
/// # Errors
///
/// Returns a [`std::io::Error`] if
/// - the Piper process cannot be spawned,
/// - Piper exits with a non-success status (the error carries Piper's stderr output, or a generic
///   hint if stderr is empty),
/// - writing the text to Piper's stdin or collecting its output fails, or
/// - the conversion of Piper's output fails.
pub async fn synthesize(text: &str, config: &PiperConfig) -> Result<Vec<Vec<u8>>, std::io::Error> {
    // `EXE_SUFFIX` is `.exe` on Windows and empty on other platforms, so this resolves to
    // `piper.exe` on Windows and to `piper` elsewhere.
    let executable = config
        .piper_path
        .join(format!("piper{}", std::env::consts::EXE_SUFFIX));

    let mut command = Command::new(executable);
    command
        .arg("--model")
        .arg(&config.voice)
        .arg("--length_scale")
        .arg(config.speed.to_string())
        .arg("--output-raw")
        // The voice is resolved relative to the Piper directory.
        .current_dir(&config.piper_path)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());

    #[cfg(target_os = "windows")]
    {
        // Do not open a console window for the child process.
        const CREATE_NO_WINDOW: u32 = 0x08000000;
        command.creation_flags(CREATE_NO_WINDOW);
    }

    let mut child = command.spawn()?;

    // Do not bail out on a failed write right away: if Piper exited early (e.g. because the voice
    // model could not be loaded) the write fails with a broken pipe, but the useful diagnostic is
    // on Piper's stderr, which is only available after waiting for the process.
    let mut stdin = child
        .stdin
        .take()
        .expect("stdin was configured as piped above");
    let write_result = stdin.write_all(text.as_bytes()).await;
    // Close stdin so Piper sees end-of-input even if the write failed half-way.
    drop(stdin);

    let output = child.wait_with_output().await?;

    if !output.status.success() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            if output.stderr.is_empty() {
                "failed to execute piper (maybe voice model not found)".into()
            } else {
                String::from_utf8_lossy(&output.stderr)
            },
        ));
    }

    // Piper succeeded; still surface a write error, as not all of the text may have been sent.
    write_result?;

    // NOTE(review): `--output-raw` presumably makes Piper emit headerless PCM, while the helper is
    // named `wav_to_opus` -- confirm the helper accepts that input.
    crate::wav_to_opus(output.stdout.into())
        .await
        .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))
}
Loading