Skip to content

Instantly share code, notes, and snippets.

@lselden
Created February 6, 2020 21:53
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lselden/cde51ca2debdd7c7c16ea4abf621f8d5 to your computer and use it in GitHub Desktop.
Save lselden/cde51ca2debdd7c7c16ea4abf621f8d5 to your computer and use it in GitHub Desktop.
TTS using powershell
<#
.SYNOPSIS
Speak text using SSML
.DESCRIPTION
Speak text using SSML, using built in MS speech synthesis. Will output metadata about result, including any embedded <mark>s. This uses the .NET System.Speech library, which uses the older SAPI5 synthesis system.
Based on code from https://github.com/marak/say.js/
.PARAMETER Text
(default from pipeline)
The Text to speak. Text will automatically be wrapped in <speak> if necessary.
.PARAMETER Path
Output to disk instead of speakers. This is output filename. Output is always in WAV format (PCM s16le)
.PARAMETER listVoices
If -listVoices is passed then this function will just output a list of available
voices in the format {languageCode, id, name, ssmlGender}
.PARAMETER Voice
Name of voice to use. To get list of voices use -listVoices option
.PARAMETER Rate
Speech rate multiplier in the range 0.33-3, where 1 is normal speed and 2 is twice as fast
.PARAMETER Volume
Volume in range 0-1, 1 (default) is full volume
.PARAMETER SampleRate
SampleRate of output WAV file. Default is 24000
.PARAMETER Channels
Number of channels of output WAV file. Default is 1
.PARAMETER Lang
Language to use. Default is "en-US".
.INPUTS
System.String. You can pipe the "Text" parameter into the script
.OUTPUTS
Will play sound to speakers by default, or write to disk if -Path is specified.
Output is metadata about result:
{voice: string, input: string, rate: double, volume: double, duration: int (duration in milliseconds), marks: Array<{time: int (milliseconds), value: string (mark name attribute)}>}
.EXAMPLE
PS> ./out-ssml.ps1 "hello world"
#>
param(
# Text to speak; plain text is auto-escaped and wrapped in <speak> (see FixSSML).
[Parameter(ValueFromPipeline = $true)] [string] $text,
# Installed voice name to use; list candidates with -listVoices.
[Parameter(Mandatory = $false)] [string] $voice,
# Speech-rate multiplier: 1 = normal, 2 = twice as fast (mapped onto SAPI's -10..10 scale).
[Parameter(Mandatory = $false)]
[ValidateRange(0.33, 3.0)] [double] $rate = 1.0,
# Volume 0..1; scaled to SAPI's 0-100 range.
[Parameter(Mandatory = $false)]
[ValidateRange(0.0, 1.0)] [double] $volume = 1.0,
# Optional output WAV filename; omit to play through the speakers.
[Parameter(Mandatory = $false)] [string] $path,
# WAV sample rate in Hz (file output only).
[Parameter(Mandatory = $false)] [int] $sampleRate = 24000,
# WAV channel count (file output only).
[Parameter(Mandatory = $false)] [int] $channels = 1,
# xml:lang applied to the generated <speak> element when the input doesn't set one.
[Parameter(Mandatory = $false)] [string] $lang = 'en-US',
# When set, only emit the installed-voices list and skip synthesis.
[Switch] $listVoices
)
begin {
    Add-Type -AssemblyName System.speech;
    # Sentinel bookmark appended to every utterance so the BookmarkReached
    # handler can detect completion and record the total duration.
    $script:finishedBookmarkName = '__psspeak_finished';
    # Short trailing pause so the last word isn't clipped by some voices.
    $script:endPause = '50ms';

    # Normalize input into well-formed SSML:
    #  - plain text is XML-escaped and wrapped in <speak>
    #  - version / xml:lang / xmlns attributes are ensured on the root
    #  - a trailing <break/> and the sentinel <mark/> are appended
    # Returns the serialized <speak> element. Throws on invalid XML input.
    function FixSSML($text, $lang) {
        $ssmlNamespace = 'http://www.w3.org/2001/10/synthesis';
        if (-not $text.Trim().StartsWith('<speak')) {
            # escape xml
            $text = [System.Security.SecurityElement]::Escape($text);
            $text = "<speak version=`"1.0`" xml:lang=`"$lang`">$text</speak>";
        }
        # NOTE will throw error on invalid input
        $dom = [xml]$text;
        # BUGFIX: use DocumentElement instead of $dom.speak. PowerShell's XML
        # adapter returns the inner *string* (not an XmlElement) for an
        # attribute-less element such as <speak>hi</speak>, which made the
        # SetAttribute calls below fail for pre-wrapped input.
        $root = $dom.DocumentElement;
        $root.SetAttribute('version', '1.0');
        if (-not $root.GetAttribute('xml:lang')) {
            $root.SetAttribute('xml:lang', $lang);
        }
        $root.SetAttribute('xmlns', $ssmlNamespace);
        $lastBreak = $dom.CreateElement('break');
        $lastBreak.SetAttribute('time', $script:endPause) | Out-Null;
        $root.AppendChild($lastBreak) | Out-Null;
        $lastMark = $dom.CreateElement('mark');
        $lastMark.SetAttribute('name', $script:finishedBookmarkName) | Out-Null;
        $root.AppendChild($lastMark) | Out-Null;
        return $root.OuterXml;
    }
    # One synthesizer shared by every pipeline item; disposed in end{}.
    $speak = [System.Speech.Synthesis.SpeechSynthesizer]::new();
}
process {
    # -listVoices: emit voice metadata for this item and do no synthesis.
    if ($listVoices) {
        $voices = $speak.GetInstalledVoices();
        return $voices | ? { $_.Enabled -eq $true } | % {
            $info = $_.VoiceInfo;
            [PSCustomObject]@{
                languageCode = $info.Culture.ToString();
                id = $info.Id;
                name = $info.Name;
                ssmlGender = $info.Gender;
            }
        }
    }
    # Consistent with say-ssml.ps1: refuse empty input rather than speaking silence.
    if (-not $text) {
        Write-Error "No text specified";
        return;
    }
    if ($voice) {
        $speak.SelectVoice($voice);
        if ($speak.Voice.Culture.ToString() -ne $lang) {
            # write warning?
            # better option would be to add voice element
            $lang = $speak.Voice.Culture.ToString();
        }
    }
    if ($rate -ne 1.0) {
        # Map the 0.33..3 multiplier onto SAPI's logarithmic -10..10 rate scale.
        $speak.Rate = [math]::max(-10,
            [math]::Min(
                [math]::Round((9.0686 * [math]::Log($rate)) - 0.1806),
                10
            )
        );
    }
    if ($volume -ne 1.0) {
        # SAPI volume range is 0-100.
        $speak.Volume = [int]($volume * 100);
    }
    if ($path) {
        # Resolve relative paths against the current directory.
        $filepath = if ([System.IO.Path]::IsPathRooted($path)) {
            $path;
        } else {
            [System.IO.Path]::GetFullPath((join-path $pwd $path))
        }
        $format = [System.Speech.AudioFormat.SpeechAudioFormatInfo]::new($sampleRate, 16, $channels);
        $speak.SetOutputToWaveFile($filepath, $format);
    }
    $rawtext = $text;
    $text = FixSSML $text $lang;
    # Metadata object emitted after synthesis; script scope so the event
    # handler below can mutate it.
    $script:output = [PSCustomObject]@{
        voice = $speak.Voice.Name;
        # Truncate the echoed input so huge documents don't bloat the metadata.
        input = $rawtext.Substring(0, [math]::Min($rawtext.Length, 2048));
        rate = $rate;
        volume = $volume;
        duration = [int]0;
        marks = @();
    };
    # BUGFIX: register the bookmark handler exactly once. Previously
    # Add_BookmarkReached ran on every process iteration, so each additional
    # pipeline item stacked another handler and duplicated every mark.
    if (-not $script:bookmarkHandlerRegistered) {
        $speak.Add_BookmarkReached({
            param(
                [object]$sender,
                [System.Speech.Synthesis.BookmarkReachedEventArgs]$evt
            );
            $name = $evt.Bookmark;
            $time = $evt.AudioPosition.TotalMilliseconds;
            if ($name -eq $script:finishedBookmarkName) {
                # Sentinel mark appended by FixSSML: total utterance duration.
                # Cast to [int] to match the documented output shape (ms as int).
                $script:output.duration = [int]$time;
            } else {
                $script:output.marks += [pscustomobject]@{
                    time = [int]$time;
                    value = $name;
                }
            }
        });
        $script:bookmarkHandlerRegistered = $true;
    }
    try {
        $speak.SpeakSsml($text);
    } catch {
        Write-Error "Fail! $_";
        Write-Host $text;
    }
    # Detach any wave-file output so the next pipeline item defaults to speakers.
    $speak.SetOutputToNull();
    $output;
}
end {
    # Release the synthesizer. The guard mirrors say-ssml.ps1's end{} and
    # protects against begin{} having failed before $speak was created.
    if ($speak) {
        $speak.SetOutputToNull();
        $speak.Dispose();
    }
}
<#
.SYNOPSIS
Speak text using SSML.
.DESCRIPTION
Speak text using SSML, using built in MS speech synthesis. Will output metadata about result, including any embedded <mark>s
It uses the newer WinRT (Universal Windows Runtime) to perform synthesis, rather than the older SAPI5 .Net engine. Therefore, it'll only work on Windows 10.
Based on code from https://github.com/marak/say.js/
.PARAMETER Text
(default from pipeline)
The Text to speak. Text will automatically be wrapped in <speak> if necessary.
.PARAMETER Path
Output to disk instead of speakers. This is output filename. Output is always in WAV format (PCM s16le)
.PARAMETER listVoices
If -listVoices is passed then this function will just output a list of available
voices in the format {languageCode, id, name, ssmlGender}
.PARAMETER Voice
Name of voice to use. To get list of voices use -listVoices option
.PARAMETER Rate
Speech rate multiplier in the range 0.33-3, where 1 is normal speed and 2 is twice as fast
.PARAMETER Volume
Volume in range 0-1, 1 (default) is full volume
.PARAMETER SampleRate
SampleRate of output WAV file. Default is 24000
.PARAMETER Channels
Number of channels of output WAV file. Default is 1
.PARAMETER Lang
Language to use. Default is the default voice's language.
.PARAMETER SpeechMarkTypes
Marks to include in output. Default is "sentence,words,ssml" (matching the parameter's default value). Set to "" to not output any marks.
.INPUTS
System.String. You can pipe the "Text" parameter into the script
.OUTPUTS
Will play sound to speakers by default, or write to disk if -Path is specified.
Output is metadata about result:
{voice: string, input: string, rate: double, volume: double, duration: int (duration in milliseconds), marks: Array<{time: int (milliseconds), value: string (mark name attribute)}>}
.EXAMPLE
PS> ./say-ssml.ps1 "hello world"
#>
param(
# Text to speak; plain text is auto-escaped and wrapped in <speak>.
[Parameter(ValueFromPipeline = $true)] [string] $text,
# Voice display name (regex-matched against installed voices); list with -listVoices.
[Parameter(Mandatory = $false)] [string] $voice,
# Speech-rate multiplier: 1 = normal, 2 = twice as fast (clamped to WinRT's 0.5-6.0).
[Parameter(Mandatory = $false)]
[ValidateRange(0.33, 3.0)] [double] $rate = 1.0,
# Volume 0..1 applied to the synthesizer options.
[Parameter(Mandatory = $false)]
[ValidateRange(0.0, 1.0)] [double] $volume = 1.0,
# Optional output WAV filename; omit to play through the speakers.
[Parameter(Mandatory = $false)] [string] $path,
# WAV sample rate in Hz (file output only).
[Parameter(Mandatory = $false)] [int] $sampleRate = 24000,
# WAV channel count (file output only).
[Parameter(Mandatory = $false)] [int] $channels = 1,
# Language override; when empty the selected voice's language is used.
[Parameter(Mandatory = $false)] [string] $lang,
# Which cue kinds to emit as marks; substring-matched against 'sentence'/'words'.
# "" disables marker output entirely.
[Parameter(Mandatory = $false)] [string] $speechMarkTypes = "sentence,words,ssml",
# When set, only emit the installed-voices list and skip synthesis.
[Switch] $listVoices
)
begin {
# Load the WinRT interop assembly so [Windows.*] types can be used below.
# NOTE(review): the "ContentType=WindowsRuntime" type projection only works in
# Windows PowerShell (5.x) on Windows 10+ — confirm before targeting pwsh 7.
Add-Type -AssemblyName System.Runtime.WindowsRuntime
# Reference each WinRT type once ([void] discards the value) to force its
# projection to be loaded before use.
[void][Windows.Foundation.IAsyncOperation`1, Windows.Foundation, ContentType=WindowsRuntime]
[void][Windows.Foundation.IAsyncOperationWithProgress`2, Windows.Foundation, ContentType=WindowsRuntime]
[void][Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
[void][Windows.Media.SpeechSynthesis.VoiceInformation, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
[void][Windows.Media.SpeechSynthesis.SpeechSynthesisStream, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
[void][Windows.Media.Core.SpeechCue, Windows.Media.Core, ContentType=WindowsRuntime]
[void][Windows.Media.Core.TimedMetadataTrack, Windows.Media.Core, ContentType=WindowsRuntime]
# Find the single-parameter WindowsRuntimeSystemExtensions.AsTask overloads by
# reflection; these convert WinRT IAsyncOperation* objects into awaitable
# .NET Tasks so we can block on them synchronously.
$_taskMethods = [System.WindowsRuntimeSystemExtensions].GetMethods() | ? {
$_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1
}
# AsTask(IAsyncOperation`1) — used by Await.
$asTaskGeneric = ($_taskMethods | ? { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1' })[0];
# AsTask(IAsyncOperationWithProgress`2) — used by AwaitWithProgress.
$asTaskGeneric2 = ($_taskMethods | ? { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperationWithProgress`2' })[0];
# Synchronously wait for a WinRT IAsyncOperation<ResultType> and return its result.
Function Await($WinRtTask, $ResultType) {
$asTask = $asTaskGeneric.MakeGenericMethod($ResultType)
$netTask = $asTask.Invoke($null, @($WinRtTask))
$netTask.Wait(-1) | Out-Null
$netTask.Result
}
# Synchronously wait for a WinRT IAsyncOperationWithProgress<T1,T2>; the result
# is discarded (callers only need completion, e.g. the buffer copy below).
Function AwaitWithProgress($WinRtTask, $ResultType1, $ResultType2) {
$asTask = $asTaskGeneric2.MakeGenericMethod($ResultType1, $ResultType2)
$netTask = $asTask.Invoke($null, @($WinRtTask))
$netTask.Wait(-1) | Out-Null
}
# Flatten the stream's TimedMetadataTracks into a list of
# {type, time(ms), value[, start, end]} marker objects. Track Ids map to the
# say.js-style mark types; cues without text are dropped.
Function ParseMarkers($timedTextTracks) {
$list = @()
$timedTextTracks | % {
$markType = switch($_.Id) {
"SpeechWord" { "word" }
"SpeechSentence" { "sentence" }
"SpeechViseme" { "viseme" }
"SpeechBookmark" { "ssml" }
Default { "unknown" }
}
$_.Cues | % {
# Include input-position fields only when the cue reports them.
$payload = if ($_.StartPositionInInput) {
[PSCustomObject]@{
type = $markType
time = [int]$_.StartTime.TotalMilliseconds
value = $_.Text
start = $_.StartPositionInInput
end = $_.EndPositionInInput
};
} else {
[PSCustomObject]@{
type = $markType
time = [int]$_.StartTime.TotalMilliseconds
value = $_.Text
};
}
if ($payload.value) {
$list += $payload;
}
}
}
$list
}
# Play a complete in-memory WAV (header + data) synchronously through the
# default audio device, then release the player and backing stream.
Function PlayWave([System.Byte[]]$bytes) {
$memstream = [System.IO.MemoryStream]::new($bytes);
$player = [System.Media.SoundPlayer]::new($memstream)
$player.PlaySync();
$player.Dispose();
$memstream.Dispose();
}
# Write WAV bytes to $path, resolving relative paths against the current directory.
Function SaveWave($path, [System.Byte[]]$bytes) {
$filepath = if ([System.IO.Path]::IsPathRooted($path)) {
$path;
} else {
[System.IO.Path]::GetFullPath((join-path $pwd $path))
}
[System.IO.File]::WriteAllBytes($filepath, $bytes)
}
# Cache installed voices; fall back to the default voice when enumeration
# yields nothing usable.
$script:voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices;
if (-not $voices.Id) {
Write-Debug "Unable to get installed voices list. Script will only use default voice";
$voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice);
}
}
process {
    # -listVoices: emit voice metadata and do no synthesis for this item.
    if ($listVoices) {
        return $script:voices | % {
            [PSCustomObject]@{
                languageCode = $_.Language
                id = $_.DisplayName;
                name = $_.Description;
                ssmlGender = $_.Gender;
            }
        }
    }
    if (-not $text) {
        Write-Error "No text specified";
        return;
    }
    # NOTE(review): a new synthesizer is created per pipeline item but only the
    # last one is disposed in end{}; earlier instances rely on GC.
    $speech = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new()
    # Minimize trailing silence appended to the rendered audio.
    $speech.Options.AppendedSilence = [Windows.Media.SpeechSynthesis.SpeechAppendedSilence]::Min
    if ($voice) {
        # Treat -Voice as a case-insensitive pattern against display names.
        $voiceInfo = $script:voices | Where-Object { $_.DisplayName -imatch $voice } | Select-Object -First 1
        if ($voiceInfo) {
            $speech.Voice = $voiceInfo;
        } else {
            Write-Debug "No voice found matching $voice"
        }
    }
    if ($speechMarkTypes -match 'sentence') {
        $speech.Options.IncludeSentenceBoundaryMetadata = $true;
    }
    if ($speechMarkTypes -match 'words') {
        $speech.Options.IncludeWordBoundaryMetadata = $true;
    }
    if ($rate -ne 1.0) {
        # BUGFIX: [math]::Clamp does not exist on .NET Framework, which is what
        # Windows PowerShell 5.1 (required for this WinRT projection) runs on.
        # Use Max/Min instead; SpeakingRate's valid range is 0.5-6.0.
        $speech.Options.SpeakingRate = [math]::Max(0.5, [math]::Min($rate, 6.0));
    }
    if ($volume -ne 1.0) {
        # BUGFIX: volume lives on SpeechSynthesizerOptions.AudioVolume; the
        # WinRT synthesizer itself has no AudioVolume property, so the old
        # "$speech.AudioVolume = ..." assignment never changed the volume.
        $speech.Options.AudioVolume = [math]::Max(0.0, [math]::Min($volume, 1.0));
    }
    $ssmlNamespace = 'http://www.w3.org/2001/10/synthesis';
    if (-not $text.Trim().StartsWith('<speak')) {
        # Plain text: escape XML special characters and wrap in <speak>.
        $text = [System.Security.SecurityElement]::Escape($text);
        $text = "<speak version=`"1.0`">$text</speak>";
    }
    $dom = [xml]$text;
    # BUGFIX: use DocumentElement instead of $dom.speak — PowerShell's XML
    # adapter returns a plain string (not an XmlElement) for an attribute-less
    # <speak>text</speak>, which would make SetAttribute fail.
    $root = $dom.DocumentElement;
    $root.SetAttribute('version', '1.0');
    # Force the document language to the selected voice's language.
    $root.SetAttribute('xml:lang', $speech.Voice.Language);
    $root.SetAttribute('xmlns', $ssmlNamespace);
    $text = $root.OuterXml;
    # actually speak - create data stream
    try {
        $stream = Await ($speech.SynthesizeSsmlToStreamAsync($text)) ([Windows.Media.SpeechSynthesis.SpeechSynthesisStream]);
    } catch {
        Write-Error "Error creating stream $_";
        if ($_.InnerExceptions -and $_.InnerExceptions.Count) {
            $_.InnerExceptions | % {
                Write-Error "$($_.GetType().Name), $($_.Message)";
            }
        }
        return;
    }
    if (-not $stream.Size) {
        # error occurred
        Write-Error "Error Creating Synthesis Stream - no results"
        return;
    }
    if ($speechMarkTypes -ne '') {
        # Emit word/sentence/ssml cue metadata to the pipeline.
        $markers = ParseMarkers $stream.TimedMetadataTracks
        $markers
    }
    # create destination buffer
    $bytes = [array]::CreateInstance([byte], $stream.Size);
    [Windows.Storage.Streams.IBuffer]$buffer = [System.Runtime.InteropServices.WindowsRuntime.WindowsRuntimeBufferExtensions]::AsBuffer($bytes);
    # wait for buffer copy
    AwaitWithProgress ($stream.ReadAsync($buffer, [uint32]$stream.Size, [Windows.Storage.Streams.InputStreamOptions]::None)) ([Windows.Storage.Streams.IBuffer]) ([UInt32])
    # write out: save to disk when -Path is given, otherwise play to speakers
    if ($path) {
        SaveWave $path $bytes;
    } else {
        PlayWave $bytes
    }
}
end {
    # Tear down in reverse order of creation: the synthesis stream first, then
    # the synthesizer. Either may be absent if no text was ever synthesized.
    foreach ($resource in @($stream, $speech)) {
        if ($resource) {
            $resource.Dispose();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment