Skip to content

Commit

Permalink
use struct for creating ssml
Browse files Browse the repository at this point in the history
  • Loading branch information
czyt committed Apr 14, 2024
1 parent 38aaff9 commit 5e5d227
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 104 deletions.
31 changes: 22 additions & 9 deletions internal/communicate/communicate.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"crypto/tls"
"encoding/binary"
"encoding/json"
"encoding/xml"
"fmt"
"html"
"io"
Expand All @@ -29,7 +30,6 @@ import (

// Format strings used when building the SSML request text.
const (
// ssmlHeaderTemplate is a block of header lines terminated by an empty line ("\r\n\r\n").
// Its two %s verbs fill the X-RequestId value and the X-Timestamp value, in that order;
// a literal "Z" follows the timestamp.
ssmlHeaderTemplate = "X-RequestId:%s\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:%sZ\r\nPath:ssml\r\n\r\n"
// ssmlTemplate is a single-line <speak> document with five %s verbs, in order:
// voice name, pitch, rate, volume and the text placed inside <prosody>.
// The values are inserted verbatim by fmt.Sprintf, so nothing is XML-escaped here.
ssmlTemplate = "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='%s'><prosody pitch='%s' rate='%s' volume='%s'>%s</prosody></voice></speak>"
)

var (
Expand Down Expand Up @@ -464,14 +464,27 @@ func splitTextByByteLength(text string, byteLength int) [][]byte {
}

// makeSsml returns the SSML document for text, spoken with the given voice and the
// prosody settings pitch, rate and volume. Note the parameter order: text, pitch,
// voice, rate, volume.
//
// NOTE(review): this span comes from a rendered commit diff and holds two bodies back
// to back: the fmt.Sprintf(ssmlTemplate, ...) version, then the struct-based
// xml.MarshalIndent version. As written it is not valid Go (ssml is declared twice
// with := in the same scope, and everything after the first return is unreachable).
// Keep exactly one of the two bodies.
func makeSsml(text string, pitch, voice string, rate string, volume string) string {
// First body: fill the template verbatim (no escaping of any argument).
ssml := fmt.Sprintf(
ssmlTemplate,
voice,
pitch,
rate,
volume,
text)
return ssml
// Second body: build the same document shape from the Speak/Voice/Prosody structs.
ssml := &Speak{
XMLName: xml.Name{Local: "speak"},
Version: "1.0",
Xmlns: "http://www.w3.org/2001/10/synthesis",
Lang: "en-US",
Voice: []Voice{{
Name: voice,
Prosody: Prosody{
Pitch: pitch,
Rate: rate,
Volume: volume,
Text: text,
},
}},
}

// NOTE(review): encoding/xml escapes Text when it is written as character data. If the
// caller already escapes text before calling makeSsml (this file imports "html"), the
// output would be double-escaped — TODO confirm against the call site.
output, err := xml.MarshalIndent(ssml, "", " ")
if err != nil {
// The signature has no error result, so a marshal failure is reported as "".
return ""
}
return string(output)
}

func currentTimeInMST() string {
Expand Down
167 changes: 72 additions & 95 deletions internal/communicate/ssml.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,107 +2,84 @@ package communicate

import "encoding/xml"

// reference document at: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-structure

// Speak is the root element of an SSML document; XMLName pins the element name to
// "speak".
//
// Version, Xmlns, Mstts and Lang are written as the attributes "version", "xmlns",
// "mstts" and "xml:lang". Backgroundaudio and Voice are written as the child elements
// "mstts:backgroundaudio" and "voice".
//
// NOTE(review): none of the attributes uses omitempty, so unset values are emitted as
// empty attributes. The Mstts attribute is named plain "mstts"; if it is meant to
// declare the mstts namespace prefix it would need to be "xmlns:mstts" — TODO confirm.
type Speak struct {
	XMLName         xml.Name        `xml:"speak"`
	Version         string          `xml:"version,attr"`
	Xmlns           string          `xml:"xmlns,attr"`
	Mstts           string          `xml:"mstts,attr"`
	Lang            string          `xml:"xml:lang,attr"`
	Backgroundaudio Backgroundaudio `xml:"mstts:backgroundaudio"`
	Voice           Voice           `xml:"voice"`
}

// Backgroundaudio starts with four attribute fields: src, volume, fadein and fadeout.
//
// NOTE(review): this span comes from a rendered commit diff. The fields from XMLName
// onward (element name "speak", version, xmlns, xml:lang, and a slice of Voice) match
// what makeSsml sets on Speak, so they are most likely the new Speak field list shown
// inside these braces rather than genuine background-audio fields — confirm against
// the real ssml.go.
type Backgroundaudio struct {
// Background-audio attributes.
Src string `xml:"src,attr"`
Volume string `xml:"volume,attr"`
Fadein string `xml:"fadein,attr"`
Fadeout string `xml:"fadeout,attr"`
// Speak-level fields (see the note above).
XMLName xml.Name `xml:"speak"`
Version string `xml:"version,attr"`
Xmlns string `xml:"xmlns,attr"`
Lang string `xml:"xml:lang,attr"`
Voice []Voice `xml:"voice"`
}

// Voice describes one voice element and the content nested inside it.
//
// Name and Effect are written as the attributes "name" and "effect"; every other
// field is written as a child element named by its tag.
//
// NOTE(review): encoding/xml's omitempty only drops false, 0, nil pointers/interfaces
// and zero-length strings, slices, arrays and maps. It therefore has no effect on the
// struct-typed fields here (Break, Emphasis, Lexicon, Mstts, Phoneme, SayAs), which
// are always emitted even when left at their zero value.
type Voice struct {
	Name     string   `xml:"name,attr"`
	Effect   string   `xml:"effect,attr"`
	Audio    Audio    `xml:"audio"`
	Bookmark string   `xml:"bookmark,omitempty"`
	Break    Break    `xml:"break,omitempty"`
	Emphasis Emphasis `xml:"emphasis,omitempty"`
	Lang     Lang     `xml:"lang"`
	Lexicon  Lexicon  `xml:"lexicon,omitempty"`
	Math     string   `xml:"math,omitempty"`
	Mstts    Mstts    `xml:"mstts,omitempty"`
	P        string   `xml:"p,omitempty"`
	Phoneme  Phoneme  `xml:"phoneme,omitempty"`
	Prosody  Prosody  `xml:"prosody"`
	SayAs    SayAs    `xml:"say-as,omitempty"`
	Sub      string   `xml:"sub,omitempty"`
}

// Audio describes an audio element that refers to an external sound by Src.
//
// Src is written as the "src" attribute, matching how Backgroundaudio tags its own
// Src field and how SSML spells the element (<audio src="...">). The previous tag
// `xml:"src"` produced a nested <src> child element instead.
type Audio struct {
	Src string `xml:"src,attr"`
}

// Break holds the two attributes of a break element: Strength is written as
// "strength" and Time as "time". Neither uses omitempty, so both attributes are always
// present in the output, even when empty.
type Break struct {
	Strength string `xml:"strength,attr"`
	Time     string `xml:"time,attr"`
}

// Emphasis holds the single attribute of an emphasis element: Level is written as
// the "level" attribute and is always emitted, even when empty.
type Emphasis struct {
	Level string `xml:"level,attr"`
}

// Lang holds the language attribute of a lang element: XmlLang is written verbatim
// as the "xml:lang" attribute and is always emitted, even when empty.
type Lang struct {
	XmlLang string `xml:"xml:lang,attr"`
}

// Lexicon holds the single attribute of a lexicon element: URI is written as the
// "uri" attribute and is always emitted, even when empty.
type Lexicon struct {
	URI string `xml:"uri,attr"`
}

// Mstts groups the content stored in Voice's `mstts` field. Every field is written as
// a child element named by its tag: backgroundaudio, ttsembedding, express-as,
// silence, viseme and audioduration.
//
// No field uses omitempty, so all six child elements are emitted on every marshal.
type Mstts struct {
	Backgroundaudio string       `xml:"backgroundaudio"`
	Ttsembedding    TtsEmbedding `xml:"ttsembedding"`
	ExpressAs       ExpressAs    `xml:"express-as"`
	Silence         Silence      `xml:"silence"`
	Viseme          Viseme       `xml:"viseme"`
	Audioduration   string       `xml:"audioduration"`
}

// TtsEmbedding holds the single attribute of a ttsembedding element:
// SpeakerProfileId is written as the "speakerProfileId" attribute and is always
// emitted, even when empty.
type TtsEmbedding struct {
	SpeakerProfileId string `xml:"speakerProfileId,attr"`
}

// ExpressAs holds the attributes of an express-as element: Style, Styledegree and
// Role are written as the attributes "style", "styledegree" and "role", in that
// order. All three are always emitted, even when empty.
type ExpressAs struct {
	Style       string `xml:"style,attr"`
	Styledegree string `xml:"styledegree,attr"`
	Role        string `xml:"role,attr"`
}

// Silence holds the attributes of a silence element: Type is written as the "type"
// attribute and Value as the "value" attribute. Both are always emitted, even when
// empty.
type Silence struct {
	Type  string `xml:"type,attr"`
	Value string `xml:"value,attr"`
}

// Viseme holds the single attribute of a viseme element: Type is written as the
// "type" attribute and is always emitted, even when empty.
type Viseme struct {
	Type string `xml:"type,attr"`
}

// Phoneme starts with two attribute fields: alphabet and ph.
//
// NOTE(review): this span comes from a rendered commit diff. The trailing Name
// attribute and Prosody child element match what makeSsml sets on each Voice value, so
// they are most likely the new Voice field list shown inside these braces — confirm
// against the real ssml.go.
type Phoneme struct {
Alphabet string `xml:"alphabet,attr"`
Ph string `xml:"ph,attr"`
// Voice-level fields (see the note above).
Name string `xml:"name,attr"`
Prosody Prosody `xml:"prosody"`
}

// Prosody carries the prosody attributes pitch, contour, range, rate and volume, each
// written as an XML attribute. Contour and Range are omitted when empty; Pitch, Rate
// and Volume are always written.
//
// NOTE(review): this span comes from a rendered commit diff. makeSsml also sets a Text
// field on Prosody, and that field is not declared inside this span — confirm the
// final field list against the real ssml.go.
type Prosody struct {
Pitch string `xml:"pitch,attr"`
// Contour represents changes in pitch. These changes are represented as an array of
// targets at specified time positions in the speech output. Sets of parameter pairs
// define each target. For example:
//
//	<prosody contour="(0%,+20Hz) (10%,-2st) (40%,+10Hz)">
//
// The first value in each set of parameters specifies the location of the pitch change
// as a percentage of the duration of the text. The second value specifies the amount
// to raise or lower the pitch by using a relative value or an enumeration value for
// pitch (see pitch).
Contour string `xml:"contour,attr,omitempty"`
Range string `xml:"range,attr,omitempty"`
Rate string `xml:"rate,attr"`
Volume string `xml:"volume,attr"`
}

// SayAs starts with three attribute fields: interpret-as, format and detail.
//
// NOTE(review): this span comes from a rendered commit diff. The documented Pitch,
// Rate, Volume and Text fields after Detail are the ones makeSsml sets on Prosody, so
// they are most likely the new Prosody fields shown inside these braces — confirm
// against the real ssml.go.
type SayAs struct {
InterpretAs string `xml:"interpret-as,attr"`
Format string `xml:"format,attr"`
Detail string `xml:"detail,attr"`
// Pitch indicates the baseline pitch for the text. Pitch changes can be applied at the
// sentence level and should be within 0.5 to 1.5 times the original audio. You can
// express the pitch as:
//
// An absolute value: a number followed by "Hz" (Hertz). For example,
// <prosody pitch="600Hz">some text</prosody>.
//
// A relative value:
//   - As a relative number: a number preceded by "+" or "-" and followed by "Hz" or
//     "st" that specifies an amount to change the pitch. For example:
//     <prosody pitch="+80Hz">some text</prosody> or <prosody pitch="-2st">some text</prosody>.
//     The "st" indicates the change unit is semitone, which is half of a tone (a half
//     step) on the standard diatonic scale.
//   - As a percentage: a number preceded by "+" (optionally) or "-" and followed by
//     "%", indicating the relative change. For example:
//     <prosody pitch="50%">some text</prosody> or <prosody pitch="-50%">some text</prosody>.
//
// A constant value:
//   - x-low
//   - low
//   - medium
//   - high
//   - x-high
//   - default
Pitch string `xml:"pitch,attr"`
// Rate indicates the speaking rate of the text. Speaking rate can be applied at the
// word or sentence level. The rate changes should be within 0.5 to 2 times the
// original audio. You can express rate as:
//
// A relative value:
//   - As a relative number: a number that acts as a multiplier of the default. For
//     example, a value of 1 results in no change in the original rate. A value of 0.5
//     results in a halving of the original rate. A value of 2 results in twice the
//     original rate.
//   - As a percentage: a number preceded by "+" (optionally) or "-" and followed by
//     "%", indicating the relative change. For example:
//     <prosody rate="50%">some text</prosody> or <prosody rate="-50%">some text</prosody>.
//
// A constant value:
//   - x-slow
//   - slow
//   - medium
//   - fast
//   - x-fast
//   - default
Rate string `xml:"rate,attr"`
// Volume indicates the volume level of the speaking voice. Volume changes can be
// applied at the sentence level. You can express the volume as:
//
// An absolute value: a number in the range of 0.0 to 100.0, from quietest to loudest,
// such as 75. The default value is 100.0.
//
// A relative value:
//   - As a relative number: a number preceded by "+" or "-" that specifies an amount
//     to change the volume. Examples are +10 or -5.5.
//   - As a percentage: a number preceded by "+" (optionally) or "-" and followed by
//     "%", indicating the relative change. For example:
//     <prosody volume="50%">some text</prosody> or <prosody volume="+3%">some text</prosody>.
//
// A constant value:
//   - silent
//   - x-soft
//   - soft
//   - medium
//   - loud
//   - x-loud
//   - default
Volume string `xml:"volume,attr"`
// Text is written as the element's character data.
Text string `xml:",chardata"`
}

0 comments on commit 5e5d227

Please sign in to comment.