Created February 6, 2020 21:53
TTS using powershell
Speak text using SSML
Speak text using SSML, using the built-in Microsoft speech synthesis. Will output metadata about the result, including any embedded <mark>s. This uses the .NET System.Speech library, which uses the older SAPI5 synthesis system.
Based on code from
(default from pipeline)
The Text to speak. Text will automatically be wrapped in <speak> if necessary.
Output to disk instead of speakers. This is output filename. Output is always in WAV format (PCM s16le)
.PARAMETER listVoices
If -listVoices is passed then this function will just output a list of available
voices in the format {languageCode, id, name, ssmlGender}
Name of voice to use. To get list of voices use -listVoices option
Speech rate in the range 0.33 - 3, where 1 is normal speed (100%) and 2 is twice as fast
Volume in range 0-1, 1 (default) is full volume
SampleRate of output WAV file. Default is 24000
Number of channels of output WAV file. Default is 1
Language to use. Default is "en-US".
System.String. You can pipe the "Text" parameter into the script
Will play sound to speakers by default, or write to disk if -Path is specified.
Output is metadata about result:
{voice: string, input: string, rate: double, volume: double, duration: int (duration in milliseconds), marks: Array<{time: int (milliseconds), value: string (mark name attribute)}>}
PS> ./out-ssml.ps1 "hello world"
# Script parameters.
# NOTE(review): the enclosing param( ... ) delimiters are not visible in this view.
# Text to speak; may be supplied from the pipeline.
[Parameter(ValueFromPipeline = $true)] [string] $text,
# Optional voice name.
[Parameter(Mandatory = $false)] [string] $voice,
# Speech rate multiplier, validated to 0.33..3.0 (default 1.0).
[Parameter(Mandatory = $false)]
[ValidateRange(0.33, 3.0)] [double] $rate = 1.0,
# Volume, validated to 0.0..1.0 (default 1.0).
[Parameter(Mandatory = $false)]
[ValidateRange(0.0, 1.0)] [double] $volume = 1.0,
# Optional output file path; when set, output goes to a WAV file.
[Parameter(Mandatory = $false)] [string] $path,
# Sample rate and channel count used for the WAV output format.
[Parameter(Mandatory = $false)] [int] $sampleRate = 24000,
[Parameter(Mandatory = $false)] [int] $channels = 1,
# Language code placed in xml:lang when the SSML does not already specify one.
[Parameter(Mandatory = $false)] [string] $lang = 'en-US',
# When set, the script lists the installed voices.
[Switch] $listVoices
# begin block: setup that runs before pipeline input is processed.
begin {
# Load the System.Speech assembly, which provides the SpeechSynthesizer type used later.
Add-Type -AssemblyName System.speech;
# Start-Sleep -Milliseconds 1;
# Name of the bookmark that FixSSML appends as the last element of the SSML;
# the bookmark handling compares against it to record the total duration.
$script:finishedBookmarkName = '__psspeak_finished';
# Length of the <break> that FixSSML appends before the final bookmark.
$script:endPause = '50ms';
# Normalizes the input into a complete SSML <speak> document.
#
# Parameters:
#   $text - plain text or an SSML string. Input that does not start with
#           '<speak' is XML-escaped and wrapped in a <speak> root element.
#   $lang - language code written to xml:lang when the root has none.
#
# Returns the serialized <speak> element with version="1.0", an xml:lang
# attribute, an xmlns attribute, and a trailing <break> plus <mark> (named by
# $script:endPause and $script:finishedBookmarkName).
#
# The [xml] cast throws when the input is not well-formed XML.
function FixSSML($text, $lang) {
    # NOTE(review): the namespace value is empty here - confirm whether the
    # SSML namespace URI was intended.
    $ssmlNamespace = '';
    if (-not $text.Trim().StartsWith('<speak')) {
        # escape xml
        $text = [System.Security.SecurityElement]::Escape($text);
        $text = "<speak version=`"1.0`" xml:lang=`"$lang`">$text</speak>";
    }

    # NOTE will throw error on invalid input
    $dom = [xml]$text;

    # Use DocumentElement instead of $dom.speak: PowerShell's XML adapter
    # returns a plain string for an element that has no attributes and only
    # text content (e.g. '<speak>hello</speak>'), which has no SetAttribute.
    $root = $dom.DocumentElement;

    $root.SetAttribute('version', '1.0');
    if (-not $root.GetAttribute('xml:lang')) {
        $root.SetAttribute('xml:lang', $lang);
    }
    $root.SetAttribute('xmlns', $ssmlNamespace);

    # Short pause followed by a bookmark so the end of speech can be detected.
    $lastBreak = $dom.CreateElement('break');
    $lastBreak.SetAttribute('time', $script:endPause) | Out-Null;
    $root.AppendChild($lastBreak) | Out-Null;

    $lastMark = $dom.CreateElement('mark');
    $lastMark.SetAttribute('name', $script:finishedBookmarkName) | Out-Null;
    $root.AppendChild($lastMark) | Out-Null;

    return $root.OuterXml;
}
# Create the System.Speech synthesizer used by the statements below.
$speak = [System.Speech.Synthesis.SpeechSynthesizer]::new();
# process block: runs for each pipeline input.
# NOTE(review): closing braces and several statements are not visible in this
# view of the file; comments below describe only what is shown.
process {
# -listVoices: enumerate the installed voices, keep the enabled ones and project
# languageCode / id / name / ssmlGender from each voice's VoiceInfo.
# NOTE(review): the object-literal opener for the projected fields is not visible here.
if ($listVoices) {
$voices = $speak.GetInstalledVoices();
return $voices | ? { $_.Enabled -eq $true } | % {
$info = $_.VoiceInfo;
languageCode = $info.Culture.ToString();
id = $info.Id;
name = $info.Name;
ssmlGender = $info.Gender;
# NOTE(review): the statement that applies -voice to the synthesizer is not visible here.
if ($voice) {
# When the synthesizer voice's culture differs from $lang, $lang is replaced
# by the voice's culture.
if ($speak.Voice.Culture.ToString() -ne $lang) {
# write warning?
# better option would be to add voice element
$lang = $speak.Voice.Culture.ToString();
# Convert the rate multiplier to the synthesizer's Rate using a logarithmic
# formula, with -10 passed to max as the lower bound.
# NOTE(review): the expression appears truncated in this view (the max( call is not closed).
if ($rate -ne 1.0) {
$speak.Rate = [math]::max(-10,
[math]::Round((9.0686 * [math]::Log($rate)) - 0.1806),
# Scale the 0..1 volume to an integer percentage.
if ($volume -ne 1.0) {
$speak.Volume = [int]($volume * 100);
# -path: a relative path is resolved against $pwd, then output is directed to a
# WAV file whose format is built from $sampleRate, 16 (presumably bits per
# sample - confirm) and $channels.
# NOTE(review): the value used for an already-rooted path is not visible here.
if ($path) {
$filepath = if ([System.IO.Path]::IsPathRooted($path)) {
} else {
[System.IO.Path]::GetFullPath((join-path $pwd $path))
$format = [System.Speech.AudioFormat.SpeechAudioFormatInfo]::new($sampleRate, 16, $channels);
$speak.SetOutputToWaveFile($filepath, $format);
# Keep the raw input for the result metadata, then convert the text to SSML.
$rawtext = $text;
$text = FixSSML $text $lang;
# Result metadata. input is truncated to at most 2048 characters; duration and
# marks start empty and are filled in by the bookmark handling below.
$script:output = [PSCustomObject]@{
voice = $speak.Voice.Name;
input = $rawtext.Substring(0, [math]::Min($rawtext.Length, 2048));
rate = $rate;
volume = $volume;
duration = [int]0;
marks = @();
# $script:stats = @{
# start = get-date
# }
# $speak.Add_VisemeReached({
# param(
# [object]$sender,
# [System.Speech.Synthesis.VisemeReachedEventArgs]$evt
# );
# write-host "progress $($evt.AudioPosition.TotalMilliseconds) dur=$($evt.Duration.TotalMilliseconds) $($evt.Viseme)"
# })
# Bookmark handling: the finished bookmark stores its audio position as the
# total duration; any other bookmark is appended to marks with its time and name.
# NOTE(review): the handler registration that supplies $evt is not visible here.
$name = $evt.Bookmark;
$time = $evt.AudioPosition.TotalMilliseconds;
if ($name -eq $script:finishedBookmarkName) {
$script:output.duration = $time;
# Write-Host "done! $time";
# $script:stats.done = get-date;
} else {
$script:output.marks += [pscustomobject]@{
time = $time;
value = $name;
# NOTE(review): the body of this try (presumably the synthesis call) is not visible here.
try {
} catch {
# Report the failure; the SSML text is also written to the host.
Write-Error "Fail! $_";
Write-Host $text;
# $script:stats.end = Get-Date;
# Write-Host "$(($stats.done - $stats.start).TotalMilliseconds) and to end is $(($stats.end - $stats.start).TotalMilliseconds)"
# end block. NOTE(review): its body is not visible in this view.
end {
Speak text using SSML.
Speak text using SSML, using the built-in Microsoft speech synthesis. Will output metadata about the result, including any embedded <mark>s
It uses the newer WinRT (Windows Runtime) API to perform synthesis, rather than the older SAPI5 .NET engine. Therefore, it will only work on Windows 10 or later.
Based on code from
(default from pipeline)
The Text to speak. Text will automatically be wrapped in <speak> if necessary.
Output to disk instead of speakers. This is output filename. Output is always in WAV format (PCM s16le)
.PARAMETER listVoices
If -listVoices is passed then this function will just output a list of available
voices in the format {languageCode, id, name, ssmlGender}
Name of voice to use. To get list of voices use -listVoices option
Speech rate in the range 0.33 - 3, where 1 is normal speed (100%) and 2 is twice as fast. Values below 0.5 are raised to 0.5 before synthesis.
Volume in range 0-1, 1 (default) is full volume
SampleRate of output WAV file. Default is 24000
Number of channels of output WAV file. Default is 1
Language to use. Default is the default voice's language.
.PARAMETER SpeechMarkTypes
Marks to include in output, as a comma-separated list. Default is "sentence,words,ssml". Set to "" to not output any marks.
System.String. You can pipe the "Text" parameter into the script
Will play sound to speakers by default, or write to disk if -Path is specified.
Output is metadata about result:
{voice: string, input: string, rate: double, volume: double, duration: int (duration in milliseconds), marks: Array<{time: int (milliseconds), value: string (mark name attribute)}>}
PS> ./say-ssml.ps1 "hello world"
# Script parameters.
# NOTE(review): the enclosing param( ... ) delimiters are not visible in this view.
# Text to speak; may be supplied from the pipeline.
[Parameter(ValueFromPipeline = $true)] [string] $text,
# Optional voice name, matched against the voices' DisplayName.
[Parameter(Mandatory = $false)] [string] $voice,
# Speech rate multiplier, validated to 0.33..3.0 (default 1.0).
[Parameter(Mandatory = $false)]
[ValidateRange(0.33, 3.0)] [double] $rate = 1.0,
# Volume, validated to 0.0..1.0 (default 1.0).
[Parameter(Mandatory = $false)]
[ValidateRange(0.0, 1.0)] [double] $volume = 1.0,
# Optional output file path; when set, the audio bytes are saved instead of played.
[Parameter(Mandatory = $false)] [string] $path,
# NOTE(review): no use of $sampleRate, $channels or $lang is visible in this view - confirm.
[Parameter(Mandatory = $false)] [int] $sampleRate = 24000,
[Parameter(Mandatory = $false)] [int] $channels = 1,
[Parameter(Mandatory = $false)] [string] $lang,
# Comma-separated mark types; tested with -match for 'sentence' and 'words'.
[Parameter(Mandatory = $false)] [string] $speechMarkTypes = "sentence,words,ssml",
# When set, the script lists the available voices.
[Switch] $listVoices
# begin block: setup that runs before pipeline input is processed.
begin {
# Load the assembly that provides System.WindowsRuntimeSystemExtensions (used below).
Add-Type -AssemblyName System.Runtime.WindowsRuntime
# Reference each WinRT type once, discarding the result, so the type is loaded
# into the session before it is used.
[void][Windows.Foundation.IAsyncOperation`1, Windows.Foundation, ContentType=WindowsRuntime]
[void][Windows.Foundation.IAsyncOperationWithProgress`2, Windows.Foundation, ContentType=WindowsRuntime]
[void][Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
[void][Windows.Media.SpeechSynthesis.VoiceInformation, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
[void][Windows.Media.SpeechSynthesis.SpeechSynthesisStream, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
[void][Windows.Media.Core.SpeechCue, Windows.Media.Core, ContentType=WindowsRuntime]
[void][Windows.Media.Core.TimedMetadataTrack, Windows.Media.Core, ContentType=WindowsRuntime]
# Collect the single-parameter AsTask overloads via reflection.
# NOTE(review): the closing brace of this filter is not visible in this view.
$_taskMethods = [System.WindowsRuntimeSystemExtensions].GetMethods() | ? {
$_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1
# Pick the overloads whose parameter is IAsyncOperation`1 and
# IAsyncOperationWithProgress`2; Await / AwaitWithProgress close them over result types.
$asTaskGeneric = ($_taskMethods | ? { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1' })[0];
$asTaskGeneric2 = ($_taskMethods | ? { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperationWithProgress`2' })[0];
# Blocks until a WinRT IAsyncOperation completes.
#   $WinRtTask  - the WinRT async operation to wait for.
#   $ResultType - the operation's result type, used to close the generic AsTask method.
# NOTE(review): the statement that returns the task result and the closing
# brace are not visible in this view.
Function Await($WinRtTask, $ResultType) {
$asTask = $asTaskGeneric.MakeGenericMethod($ResultType)
# AsTask is static, hence the $null target.
$netTask = $asTask.Invoke($null, @($WinRtTask))
# Wait with a timeout of -1; the boolean returned by Wait is discarded.
$netTask.Wait(-1) | Out-Null
# Blocks until a WinRT IAsyncOperationWithProgress completes.
#   $WinRtTask    - the WinRT async operation to wait for.
#   $ResultType1  - first generic argument passed to AsTask (the caller passes the result type).
#   $ResultType2  - second generic argument passed to AsTask (the caller passes the progress type).
# NOTE(review): any return statement and the closing brace are not visible in this view.
Function AwaitWithProgress($WinRtTask, $ResultType1, $ResultType2) {
$asTask = $asTaskGeneric2.MakeGenericMethod($ResultType1, $ResultType2)
# AsTask is static, hence the $null target.
$netTask = $asTask.Invoke($null, @($WinRtTask))
# Wait with a timeout of -1; the boolean returned by Wait is discarded.
$netTask.Wait(-1) | Out-Null
# Flattens the cues of the given timed-metadata tracks into a list of markers.
#
# Parameters:
#   $timedTextTracks - collection of tracks; each exposes an Id and a Cues list.
#
# Returns one object per cue that has non-empty text:
#   type  - 'word', 'sentence', 'viseme', 'ssml' or 'unknown', derived from the track Id
#   time  - cue start time in whole milliseconds
#   value - cue text
#   start / end - character positions in the input, present only when the cue
#                 reports a start position
Function ParseMarkers($timedTextTracks) {
    $list = @()
    $timedTextTracks | ForEach-Object {
        # Capture the track before the switch, which rebinds $_ inside its clauses.
        $track = $_
        $markType = switch ($track.Id) {
            "SpeechWord" { "word" }
            "SpeechSentence" { "sentence" }
            "SpeechViseme" { "viseme" }
            "SpeechBookmark" { "ssml" }
            Default { "unknown" }
        }
        $track.Cues | ForEach-Object {
            # Compare against $null explicitly: a start position of 0 (the first
            # word or sentence of the input) is falsy and would otherwise lose
            # its start/end fields.
            $payload = if ($null -ne $_.StartPositionInInput) {
                [PSCustomObject]@{
                    type = $markType
                    time = [int]$_.StartTime.TotalMilliseconds
                    value = $_.Text
                    start = $_.StartPositionInInput
                    end = $_.EndPositionInInput
                }
            } else {
                [PSCustomObject]@{
                    type = $markType
                    time = [int]$_.StartTime.TotalMilliseconds
                    value = $_.Text
                }
            }
            # Cues without text are dropped.
            if ($payload.value) {
                $list += $payload;
            }
        }
    }
    return $list
}
# Wraps the given WAV bytes in a MemoryStream and creates a SoundPlayer over it.
#   $bytes - audio data as a byte array.
# NOTE(review): the playback call and the closing brace are not visible in this view.
Function PlayWave([System.Byte[]]$bytes) {
$memstream = [System.IO.MemoryStream]::new($bytes);
$player = [System.Media.SoundPlayer]::new($memstream)
# Writes the given bytes to a file.
#   $path  - destination; a relative path is resolved against $pwd.
#   $bytes - audio data as a byte array.
# NOTE(review): the value used for an already-rooted path and the closing
# braces are not visible in this view.
Function SaveWave($path, [System.Byte[]]$bytes) {
$filepath = if ([System.IO.Path]::IsPathRooted($path)) {
} else {
[System.IO.Path]::GetFullPath((join-path $pwd $path))
[System.IO.File]::WriteAllBytes($filepath, $bytes)
# Cache the list of all voices known to the WinRT synthesizer.
$script:voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices;
# If no voice Id can be read from the list, fall back to a one-element list
# holding the default voice.
if (-not $voices.Id) {
Write-Debug "Unable to get installed voices list. Script will only use default voice";
$voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice);
# process block: runs for each pipeline input.
# NOTE(review): closing braces and some statements are not visible in this view.
process {
# -listVoices: project languageCode / id / name / ssmlGender from each cached voice.
# Note that id is taken from DisplayName and name from Description.
# NOTE(review): the object-literal opener for the projected fields is not visible here.
if ($listVoices) {
return $script:voices | % {
languageCode = $_.Language
id = $_.DisplayName;
name = $_.Description;
ssmlGender = $_.Gender;
# Report missing text.
# NOTE(review): whether processing stops after this error is not visible here.
if (-not $text) {
Write-Error "No text specified";
# Create the synthesizer and request the minimum appended silence.
$speech = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new()
$speech.Options.AppendedSilence = [Windows.Media.SpeechSynthesis.SpeechAppendedSilence]::Min
# -voice is treated as a case-insensitive regular expression against DisplayName;
# the first matching voice is used, otherwise the synthesizer's voice is left unchanged.
if ($voice) {
$voiceInfo = $script:voices | Where-Object { $_.DisplayName -imatch $voice } | Select-Object -First 1
if ($voiceInfo) {
$speech.Voice = $voiceInfo;
} else {
Write-Debug "No voice found matching $voice"
# Request boundary metadata for the mark types named in -speechMarkTypes.
if ($speechMarkTypes -match 'sentence') {
    $speech.Options.IncludeSentenceBoundaryMetadata = $true;
}
# Match 'word' so that both the documented value "word" and the default
# "words" enable word-boundary cues.
if ($speechMarkTypes -match 'word') {
    $speech.Options.IncludeWordBoundaryMetadata = $true;
}
if ($rate -ne 1.0) {
    # Keep the rate within 0.5..6.0. Min/Max is used because [math]::Clamp
    # does not exist on .NET Framework (Windows PowerShell 5.1).
    $speech.Options.SpeakingRate = [math]::Max(0.5, [math]::Min(6.0, $rate));
}
if ($volume -ne 1.0) {
    # AudioVolume is a member of the synthesizer's Options object, like
    # SpeakingRate above, not of the synthesizer itself.
    $speech.Options.AudioVolume = [math]::Max(0.0, [math]::Min(1.0, $volume));
}
# Normalize $text into a complete SSML <speak> document.
# NOTE(review): the namespace value is empty here - confirm whether the SSML
# namespace URI was intended.
$ssmlNamespace = '';
if (-not $text.Trim().StartsWith('<speak')) {
    # Plain text: escape XML special characters, then wrap in a <speak> root.
    $text = [System.Security.SecurityElement]::Escape($text);
    $text = "<speak version=`"1.0`">$text</speak>";
}
# The [xml] cast throws when the input is not well-formed XML.
$dom = [xml]$text;
# Use DocumentElement instead of $dom.speak: PowerShell's XML adapter returns a
# plain string for an element that has no attributes and only text content
# (e.g. '<speak>hello</speak>'), which has no SetAttribute method.
$ssmlRoot = $dom.DocumentElement;
$ssmlRoot.SetAttribute('version', '1.0');
# xml:lang always follows the language of the selected voice.
$ssmlRoot.SetAttribute('xml:lang', $speech.Voice.Language);
$ssmlRoot.SetAttribute('xmlns', $ssmlNamespace);
$text = $ssmlRoot.OuterXml;
# actually speak - create data stream
try {
    $stream = Await ($speech.SynthesizeSsmlToStreamAsync($text)) ([Windows.Media.SpeechSynthesis.SpeechSynthesisStream]);
} catch {
    Write-Error "Error creating stream $_";
    # In a catch block $_ is an ErrorRecord, which has no InnerExceptions
    # member; the exception itself is in $_.Exception. Walk its InnerException
    # chain and report the contents of the first AggregateException found
    # (the type thrown by Task.Wait).
    $current = $_.Exception;
    while ($current) {
        if ($current -is [System.AggregateException]) {
            $current.InnerExceptions | ForEach-Object {
                Write-Error "$($_.GetType().Name), $($_.Message)";
            }
            break;
        }
        $current = $current.InnerException;
    }
}
# A missing or zero stream size is reported as a synthesis failure.
# NOTE(review): whether processing stops after this error is not visible in this view.
if (-not $stream.Size) {
# error occurred
Write-Error "Error Creating Synthesis Stream - no results"
# Collect markers from the stream's timed metadata tracks unless mark output was disabled with "".
if ($speechMarkTypes -ne '') {
$markers = ParseMarkers $stream.TimedMetadataTracks
# create destination buffer
$bytes = [array]::CreateInstance([byte], $stream.Size);
# Expose the byte array as a WinRT IBuffer so ReadAsync can be given it as the destination.
[Windows.Storage.Streams.IBuffer]$buffer = [System.Runtime.InteropServices.WindowsRuntime.WindowsRuntimeBufferExtensions]::AsBuffer($bytes);
# wait for buffer copy
AwaitWithProgress ($stream.ReadAsync($buffer, [uint32]$stream.Size, [Windows.Storage.Streams.InputStreamOptions]::None)) ([Windows.Storage.Streams.IBuffer]) ([UInt32])
#write out
# Save to -path when given, otherwise play the audio.
if ($path) {
SaveWave $path $bytes;
} else {
PlayWave $bytes
# end block.
# NOTE(review): the bodies of these checks (presumably cleanup of $stream and
# $speech - confirm) are not visible in this view.
end {
if ($stream) {
if ($speech) {
