Skip to content

Instantly share code, notes, and snippets.

@lselden
Last active December 7, 2023 12:07
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save lselden/ab2e04fbac785e0644c4b562bf5e35cd to your computer and use it in GitHub Desktop.
Save lselden/ab2e04fbac785e0644c4b562bf5e35cd to your computer and use it in GitHub Desktop.
Powershell script to say text using the WinRT speech synthesis API
<#
.SYNOPSIS
Speak text using SSML
.DESCRIPTION
Speak text using SSML, using built in MS speech synthesis. Will output metadata about result, including any embedded <mark>s
.PARAMETER Text
(default from pipeline)
The Text to speak. Text will automatically be wrapped in <speak> if necessary.
.PARAMETER Path
Output to disk instead of speakers. This is output filename. Output is always in WAV format (PCM s16le)
.PARAMETER Variable
Name of a global variable to store the output in, instead of playing to speakers. The variable receives the WAV data (PCM s16le) as a byte array. Takes precedence over -Path.
.PARAMETER listVoices
If -listVoices is passed then this function will just output a list of available
voices in the format {languageCode, id, name, ssmlGender}
.PARAMETER Voice
Name of voice to use. To get list of voices use -listVoices option
.PARAMETER Rate
Speech rate in range 0.33 - 3, where 1 = 100% and 2 is twice as fast. Values below 0.5 are raised to 0.5 before being applied.
.PARAMETER Volume
Volume in range 0-1, 1 (default) is full volume
.PARAMETER SampleRate
SampleRate of output WAV file. Default is 24000. NOTE: the parameter is accepted but is not currently applied by the script.
.PARAMETER Channels
Number of channels of output WAV file. Default is 1. NOTE: the parameter is accepted but is not currently applied by the script.
.PARAMETER Lang
Language to use. Default is the default voice's language.
.PARAMETER SpeechMarkTypes
Marks to include in output. Default is sentence,words,ssml. Set to "" to not output any marks.
.INPUTS
System.String. You can pipe the "Text" parameter into the script
.OUTPUTS
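Will play sound to speakers by default, write to disk if -Path is specified, or store the WAV bytes in a global variable if -Variable is specified.
Pipeline output is the list of speech marks found in the result (nothing if -SpeechMarkTypes is ""), one object per mark:
{type: string (word | sentence | viseme | ssml | unknown), time: int (milliseconds), value: string (cue text, e.g. the mark name attribute), start: int, end: int (positions in the input; only present when the cue reports them)}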
.EXAMPLE
PS> ./out-ssml-winrt.ps1 "hello world"
#>
param(
# Text or SSML to speak. Bound from the pipeline, so the process block runs once per piped string.
[Parameter(ValueFromPipeline = $true)] [string] $text,
# Pattern matched (case-insensitively) against each installed voice's DisplayName; the first match is used.
[Parameter(Mandatory = $false)] [string] $voice,
# Speaking-rate multiplier; rejected at bind time unless within 0.33 - 3.0.
[Parameter(Mandatory = $false)]
[ValidateRange(0.33, 3.0)] [double] $rate = 1.0,
# Output volume; rejected at bind time unless within 0.0 - 1.0.
[Parameter(Mandatory = $false)]
[ValidateRange(0.0, 1.0)] [double] $volume = 1.0,
# File to write the WAV bytes to instead of playing them. Ignored when -variable is also given.
[Parameter(Mandatory = $false)] [string] $path,
# Name of a global variable that receives the WAV bytes; checked before -path.
[Parameter(Mandatory = $false)] [string] $variable,
# NOTE(review): $sampleRate and $channels are not referenced by the begin/process/end blocks of this script.
[Parameter(Mandatory = $false)] [int] $sampleRate = 24000,
[Parameter(Mandatory = $false)] [int] $channels = 1,
# Language to use for synthesis, as described in the script help.
[Parameter(Mandatory = $false)] [string] $lang,
# Mark types to request; tested by substring match, and an empty string suppresses mark output.
[Parameter(Mandatory = $false)] [string] $speechMarkTypes = "sentence,words,ssml",
# When set, the script only lists the available voices and performs no synthesis.
[Switch] $listVoices
)
begin {
# Load the WinRT interop assembly, then evaluate (and discard) a type literal for every
# WinRT type the script uses -- presumably to force each projection to be resolved up front.
Add-Type -AssemblyName System.Runtime.WindowsRuntime
$null = [Windows.Foundation.IAsyncOperation`1, Windows.Foundation, ContentType=WindowsRuntime]
$null = [Windows.Foundation.IAsyncOperationWithProgress`2, Windows.Foundation, ContentType=WindowsRuntime]
$null = [Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
$null = [Windows.Media.SpeechSynthesis.VoiceInformation, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
$null = [Windows.Media.SpeechSynthesis.SpeechSynthesisStream, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
$null = [Windows.Media.Core.SpeechCue, Windows.Media.Core, ContentType=WindowsRuntime]
$null = [Windows.Media.Core.TimedMetadataTrack, Windows.Media.Core, ContentType=WindowsRuntime]

# Collect the single-parameter AsTask overloads; the helpers below close them over concrete types.
$singleArgAsTask = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
    $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1
}
# Overload that accepts IAsyncOperation<TResult>.
$asTaskGeneric = (
    $singleArgAsTask | Where-Object { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1' }
)[0];
# Overload that accepts IAsyncOperationWithProgress<TResult, TProgress>.
$asTaskGeneric2 = (
    $singleArgAsTask | Where-Object { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperationWithProgress`2' }
)[0];
# Blocks until a WinRT IAsyncOperation completes and returns its result.
#   $WinRtTask  - the WinRT async operation to wait on
#   $ResultType - the operation's result type, used to close the generic AsTask overload
Function Await($WinRtTask, $ResultType) {
    # Convert the WinRT operation into a .NET Task via the reflected AsTask<T> method.
    $task = $asTaskGeneric.MakeGenericMethod($ResultType).Invoke($null, @($WinRtTask))
    # Wait with no timeout; the boolean returned by Wait is discarded.
    [void]$task.Wait(-1)
    return $task.Result
}
# Blocks until a WinRT IAsyncOperationWithProgress completes. Nothing is returned:
# the caller is expected to read the outcome from the buffer it passed to the operation.
#   $WinRtTask   - the WinRT async operation to wait on
#   $ResultType1 - the operation's result type
#   $ResultType2 - the operation's progress type
Function AwaitWithProgress($WinRtTask, $ResultType1, $ResultType2) {
    $task = $asTaskGeneric2.MakeGenericMethod($ResultType1, $ResultType2).Invoke($null, @($WinRtTask))
    # Wait with no timeout; the boolean returned by Wait is discarded.
    [void]$task.Wait(-1)
}
# Flattens the cues of every timed-metadata track into marker objects.
#   $timedTextTracks - collection of tracks, each exposing Id and Cues
# Emits one object per cue that has non-empty text:
#   type  - "word" | "sentence" | "viseme" | "ssml" | "unknown", derived from the track Id
#   time  - cue start time in whole milliseconds
#   value - cue text
#   start, end - input positions; only present when the cue reports a start position
# Markers are written straight to the pipeline instead of being appended to an array,
# which avoids re-allocating the array for every cue.
Function ParseMarkers($timedTextTracks) {
    $timedTextTracks | ForEach-Object {
        $markType = switch ($_.Id) {
            "SpeechWord" { "word" }
            "SpeechSentence" { "sentence" }
            "SpeechViseme" { "viseme" }
            "SpeechBookmark" { "ssml" }
            Default { "unknown" }
        }
        $_.Cues | ForEach-Object {
            # Cues without text are dropped.
            if ($_.Text) {
                $fields = [ordered]@{
                    type  = $markType
                    time  = [int]$_.StartTime.TotalMilliseconds
                    value = $_.Text
                }
                # NOTE(review): this is a truthiness test, so a start position of 0 is
                # treated the same as a missing one -- confirm that is intended.
                if ($_.StartPositionInInput) {
                    $fields['start'] = $_.StartPositionInInput
                    $fields['end'] = $_.EndPositionInInput
                }
                [PSCustomObject]$fields
            }
        }
    }
}
# Plays a complete WAV image synchronously on the default audio device.
#   $bytes - the WAV file contents
# The stream and player are released in finally blocks so they are not leaked
# when playback throws (e.g. malformed WAV data or no audio device).
Function PlayWave([System.Byte[]]$bytes) {
    $memstream = [System.IO.MemoryStream]::new($bytes)
    try {
        $player = [System.Media.SoundPlayer]::new($memstream)
        try {
            # PlaySync blocks until playback has finished.
            $player.PlaySync()
        } finally {
            $player.Dispose()
        }
    } finally {
        $memstream.Dispose()
    }
}
# Writes the WAV bytes to disk, overwriting any existing file.
#   $path  - destination; a rooted path is used as given, anything else is resolved against $pwd
#   $bytes - the WAV file contents
Function SaveWave($path, [System.Byte[]]$bytes) {
    $target = $path
    if (-not ([System.IO.Path]::IsPathRooted($path))) {
        # Resolve against PowerShell's current location rather than the process working directory.
        $target = [System.IO.Path]::GetFullPath((Join-Path $pwd $path))
    }
    [System.IO.File]::WriteAllBytes($target, $bytes)
}
# Stores the WAV bytes in a global variable so they remain available after the script returns.
#   $variable - name of the global variable to create or overwrite
#   $bytes    - the WAV file contents
Function WaveToVariable($variable, [System.Byte[]]$bytes) {
    $setArgs = @{
        Name  = $variable
        Value = $bytes
        Scope = 'global'
    }
    Set-Variable @setArgs
}
# Cache the installed voices once for the whole pipeline run. The list is always read and
# written through the script: scope so the fallback below updates the same variable that
# the process block later reads.
$script:voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices;
if (-not $script:voices.Id) {
    # No voice exposed an Id: fall back to a one-element list holding the default voice.
    Write-Debug "Unable to get installed voices list. Script will only use default voice";
    $script:voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice);
}
}
# Runs once per input string: synthesizes the text as SSML, emits the speech marks,
# then plays, saves, or stores the resulting WAV bytes.
process {
    # -listVoices short-circuits: emit one descriptor per known voice and do no synthesis.
    if ($listVoices) {
        return $script:voices | % {
            [PSCustomObject]@{
                languageCode = $_.Language
                id = $_.DisplayName;
                name = $_.Description;
                ssmlGender = $_.Gender;
            }
        }
    }
    if (-not $text) {
        Write-Error "No text specified";
        return;
    }

    $speech = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new()
    $stream = $null
    try {
        $speech.Options.AppendedSilence = [Windows.Media.SpeechSynthesis.SpeechAppendedSilence]::Min

        if ($voice) {
            # $voice is treated as a case-insensitive regex against the voice display name.
            $voiceInfo = $script:voices | Where-Object { $_.DisplayName -imatch $voice } | Select-Object -First 1
            if ($voiceInfo) {
                $speech.Voice = $voiceInfo;
            } else {
                Write-Debug "No voice found matching $voice"
            }
        }

        if ($speechMarkTypes -match 'sentence') {
            $speech.Options.IncludeSentenceBoundaryMetadata = $true;
        }
        # 'word' also matches the documented plural 'words'.
        if ($speechMarkTypes -match 'word') {
            $speech.Options.IncludeWordBoundaryMetadata = $true;
        }

        # Min/Max is used instead of [math]::Clamp, which does not exist on .NET Framework.
        if ($rate -ne 1.0) {
            $speech.Options.SpeakingRate = [math]::Min([math]::Max($rate, 0.5), 6.0);
        }
        if ($volume -ne 1.0) {
            # AudioVolume lives on the synthesizer's Options object, not on the synthesizer itself.
            $speech.Options.AudioVolume = [math]::Min([math]::Max($volume, 0.0), 1.0);
        }

        $ssmlNamespace = 'http://www.w3.org/2001/10/synthesis';
        if (-not $text.Trim().StartsWith('<speak')) {
            # Plain text: escape XML-special characters, then wrap it in a <speak> root.
            $text = [System.Security.SecurityElement]::Escape($text);
            $text = "<speak version=`"1.0`">$text</speak>";
        }
        $dom = [xml]$text;
        # Use DocumentElement rather than $dom.speak: PowerShell's XML adapter returns a plain
        # string for an element that has only text content and no attributes.
        $root = $dom.DocumentElement;
        # Honor -Lang when given; otherwise fall back to the selected voice's language.
        $xmlLang = if ($lang) { $lang } else { $speech.Voice.Language }
        $root.SetAttribute('version', '1.0');
        $root.SetAttribute('xml:lang', $xmlLang);
        $root.SetAttribute('xmlns', $ssmlNamespace);
        $text = $root.OuterXml;

        # actually speak - create data stream
        try {
            $stream = Await ($speech.SynthesizeSsmlToStreamAsync($text)) ([Windows.Media.SpeechSynthesis.SpeechSynthesisStream]);
        } catch {
            Write-Error "Error creating stream $_";
            # Report the wrapped exceptions too; they live on the exception, not on the ErrorRecord.
            $inner = $_.Exception.InnerException
            while ($inner) {
                Write-Error "$($inner.GetType().Name), $($inner.Message)";
                $inner = $inner.InnerException
            }
            return;
        }
        if (-not $stream.Size) {
            # error occurred
            Write-Error "Error Creating Synthesis Stream - no results"
            return;
        }

        if ($speechMarkTypes -ne '') {
            $markers = ParseMarkers $stream.TimedMetadataTracks
            $markers
        }

        # create destination buffer
        $bytes = [array]::CreateInstance([byte], $stream.Size);
        [Windows.Storage.Streams.IBuffer]$buffer = [System.Runtime.InteropServices.WindowsRuntime.WindowsRuntimeBufferExtensions]::AsBuffer($bytes);
        # wait for buffer copy
        AwaitWithProgress ($stream.ReadAsync($buffer, [uint32]$stream.Size, [Windows.Storage.Streams.InputStreamOptions]::None)) ([Windows.Storage.Streams.IBuffer]) ([UInt32])

        # write out: -variable wins over -path, which wins over playing to speakers
        if ($variable) {
            WaveToVariable $variable $bytes;
        } elseif ($path) {
            SaveWave $path $bytes;
        } else {
            PlayWave $bytes
        }
    } finally {
        # Release per-item resources here so piping several strings (or returning early)
        # does not leak a synthesizer and stream per item.
        if ($null -ne $stream) {
            $stream.Dispose();
        }
        $speech.Dispose();
        # Clear the variables so later cleanup code cannot dispose the same objects twice.
        $stream = $null
        $speech = $null
    }
}
# Final cleanup: dispose whatever stream and synthesizer are still referenced
# once the last pipeline item has been processed (stream first, then synthesizer).
end {
    foreach ($resource in @($stream, $speech)) {
        if ($resource) {
            $resource.Dispose()
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment