lselden/out-ssml-winrt.ps1

## out-ssml-winrt.ps1
<#
.SYNOPSIS

Speak text using SSML

.DESCRIPTION

Speak text using SSML, using built in MS speech synthesis. Will output metadata about result, including any embedded <mark>s

.PARAMETER Text
 (default from pipeline)
The Text to speak. Text will automatically be wrapped in <speak> if necessary.

.PARAMETER Path
Output to disk instead of speakers. This is output filename. Output is always in WAV format (PCM s16le)

.PARAMETER Variable
Store the audio in a global variable with this name, as a byte array, instead of playing it. Output is always in WAV format (PCM s16le)

.PARAMETER listVoices
If -listVoices is passed then this function will just output a list of available
voices in the format {languageCode, id, name, ssmlGender}

.PARAMETER Voice
Name of voice to use. To get list of voices use -listVoices option

.PARAMETER Rate
Speech rate in range 0.33 - 3 (values below 0.5 are raised to 0.5) -- 1 = 100%, 2 is twice as fast

.PARAMETER Volume
Volume in range 0-1, 1 (default) is full volume

.PARAMETER SampleRate
SampleRate of output WAV file. Default is 24000

.PARAMETER Channels
Number of channels of output WAV file. Default is 1

.PARAMETER Lang
Language to use. Default is the default voice's language.

.PARAMETER SpeechMarkTypes
Marks to include in output. Default is sentence,words,ssml. set to "" to not output any marks


.INPUTS

System.String. You can pipe the "Text" parameter into the script

.OUTPUTS
Will play sound to speakers by default, or write to disk if -Path is specified.
Output is the list of speech marks found in the synthesized audio (nothing when -SpeechMarkTypes is ""):
{type: string (word | sentence | viseme | ssml | unknown), time: int (milliseconds), value: string (cue text), start/end: int (position in the input, only when the cue reports one)}

.EXAMPLE
PS> ./out-ssml-winrt.ps1 "hello world"

#>
param(
    # Plain text or SSML to speak; bound from the pipeline, one item per process-block call.
    [Parameter(ValueFromPipeline = $true)] [string] $text,
    # Pattern matched against the DisplayName of the installed voices.
    [Parameter(Mandatory = $false)] [string] $voice,
    # Speaking rate multiplier, 1.0 = normal speed.
    [Parameter(Mandatory = $false)]
        [ValidateRange(0.33, 3.0)] [double] $rate = 1.0,
    # Audio volume, 1.0 = full volume.
    [Parameter(Mandatory = $false)]
        [ValidateRange(0.0, 1.0)] [double] $volume = 1.0,
    # Output file name; when set (and -variable is not) the audio is written to disk.
    [Parameter(Mandatory = $false)] [string] $path,
    # Name of a global variable that receives the audio bytes; takes precedence over -path.
    [Parameter(Mandatory = $false)] [string] $variable,
    # NOTE(review): $sampleRate and $channels are not referenced by the visible
    # begin/process/end blocks -- confirm whether they are meant to have an effect.
    [Parameter(Mandatory = $false)] [int] $sampleRate = 24000,
    [Parameter(Mandatory = $false)] [int] $channels = 1,
    # Language tag; see the .PARAMETER Lang help entry.
    [Parameter(Mandatory = $false)] [string] $lang,
    # Comma separated list of mark kinds to request; "" suppresses mark output.
    [Parameter(Mandatory = $false)] [string] $speechMarkTypes = "sentence,words,ssml",
    # When set, the script only lists the available voices.
    [Switch] $listVoices
)
begin {

    # Load the .NET <-> WinRT interop assembly that provides
    # System.WindowsRuntimeSystemExtensions (AsTask) used below.
    Add-Type -AssemblyName System.Runtime.WindowsRuntime

    # Reference each WinRT type once and discard the value; presumably this forces
    # PowerShell to load the WinRT projection for the type before it is used -- TODO confirm.
    [void][Windows.Foundation.IAsyncOperation`1,    Windows.Foundation,    ContentType=WindowsRuntime]
    [void][Windows.Foundation.IAsyncOperationWithProgress`2,    Windows.Foundation,    ContentType=WindowsRuntime]
    [void][Windows.Media.SpeechSynthesis.SpeechSynthesizer,    Windows.Media.SpeechSynthesis,    ContentType=WindowsRuntime]
    [void][Windows.Media.SpeechSynthesis.VoiceInformation,    Windows.Media.SpeechSynthesis,    ContentType=WindowsRuntime]
    [void][Windows.Media.SpeechSynthesis.SpeechSynthesisStream,    Windows.Media.SpeechSynthesis,    ContentType=WindowsRuntime]
    [void][Windows.Media.Core.SpeechCue,    Windows.Media.Core,    ContentType=WindowsRuntime]
    [void][Windows.Media.Core.TimedMetadataTrack,    Windows.Media.Core,    ContentType=WindowsRuntime]

    # All single-parameter AsTask overloads of WindowsRuntimeSystemExtensions.
    $_taskMethods = [System.WindowsRuntimeSystemExtensions].GetMethods() | ? {
        $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1
    }

    # Open generic AsTask overloads, picked by the type name of their only parameter:
    # $asTaskGeneric takes IAsyncOperation`1 (used by Await),
    # $asTaskGeneric2 takes IAsyncOperationWithProgress`2 (used by AwaitWithProgress).
    $asTaskGeneric = ($_taskMethods | ? { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1' })[0];
    $asTaskGeneric2 = ($_taskMethods | ? { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperationWithProgress`2' })[0];

    Function Await($WinRtTask, $ResultType) {
        # Converts a WinRT IAsyncOperation<ResultType> into a .NET task through the
        # AsTask overload cached in $asTaskGeneric, blocks until the task has
        # finished (no timeout) and returns the task's result.
        $bridge = $asTaskGeneric.MakeGenericMethod($ResultType)
        $pending = $bridge.Invoke($null, @($WinRtTask))
        $null = $pending.Wait(-1)
        return $pending.Result
    }
    Function AwaitWithProgress($WinRtTask, $ResultType1, $ResultType2) {
        # Same idea as Await, but for IAsyncOperationWithProgress<ResultType1, ResultType2>
        # (AsTask overload cached in $asTaskGeneric2). Blocks until the operation has
        # finished and writes nothing to the pipeline.
        $bridge = $asTaskGeneric2.MakeGenericMethod($ResultType1, $ResultType2)
        $pending = $bridge.Invoke($null, @($WinRtTask))
        $null = $pending.Wait(-1)
    }

    Function ParseMarkers($timedTextTracks) {
        # Flattens the cues of the given timed metadata tracks into mark objects.
        #
        # $timedTextTracks: collection of objects exposing .Id and .Cues; each cue is
        #   read for .StartTime, .Text and (optionally) .StartPositionInInput /
        #   .EndPositionInInput.
        # Output: one PSCustomObject per cue with non-empty text:
        #   type  - word | sentence | viseme | ssml | unknown (derived from the track id)
        #   time  - cue start time in milliseconds
        #   value - cue text
        #   start / end - input positions, present only when the cue reports a start position
        # Objects are written straight to the pipeline instead of being appended to an
        # array with +=, which re-allocates the array for every mark.
        foreach ($track in $timedTextTracks) {
            $markType = switch ($track.Id) {
                "SpeechWord" { "word" }
                "SpeechSentence" { "sentence" }
                "SpeechViseme" { "viseme" }
                "SpeechBookmark" { "ssml" }
                Default { "unknown" }
            }
            foreach ($cue in $track.Cues) {
                # Cues without text carry nothing worth reporting.
                if (-not $cue.Text) {
                    continue
                }
                $fields = [ordered]@{
                    type = $markType
                    time = [int]$cue.StartTime.TotalMilliseconds
                    value = $cue.Text
                }
                # Compare against $null explicitly: a plain truthiness test treats a
                # start position of 0 as "missing" and would drop start/end for it.
                if ($null -ne $cue.StartPositionInInput) {
                    $fields['start'] = $cue.StartPositionInInput
                    $fields['end'] = $cue.EndPositionInInput
                }
                [PSCustomObject]$fields
            }
        }
    }

    Function PlayWave([System.Byte[]]$bytes) {
        # Plays a complete WAV file held in $bytes on the default audio device and
        # blocks until playback has finished.
        # try/finally guarantees the player and the backing stream are released even
        # when the player rejects the data or playback fails.
        $memstream = [System.IO.MemoryStream]::new($bytes)
        try {
            $player = [System.Media.SoundPlayer]::new($memstream)
            try {
                $player.PlaySync()
            } finally {
                $player.Dispose()
            }
        } finally {
            $memstream.Dispose()
        }
    }

    Function SaveWave($path, [System.Byte[]]$bytes) {
        # Writes $bytes to $path, replacing any existing file. A rooted path is used
        # as given; anything else is resolved against PowerShell's current
        # location ($pwd) before writing.
        $target = $path
        if (-not [System.IO.Path]::IsPathRooted($path)) {
            $combined = Join-Path $pwd $path
            $target = [System.IO.Path]::GetFullPath($combined)
        }
        [System.IO.File]::WriteAllBytes($target, $bytes)
    }

    Function WaveToVariable($variable, [System.Byte[]]$bytes) {
        # Publishes the audio bytes as a global variable named by $variable,
        # creating the variable or overwriting an existing one.
        $assignment = @{
            Scope = 'global'
            Name  = $variable
            Value = $bytes
        }
        Set-Variable @assignment
    }

    # Cache the installed voices once for all pipeline input. Every read and write
    # uses the script: scope explicitly, so the fallback below always lands in the
    # same variable the process block reads (an unqualified $voices can end up in
    # a different scope, e.g. when the script is dot-sourced from a function).
    $script:voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices
    if (-not $script:voices.Id) {
        Write-Debug "Unable to get installed voices list. Script will only use default voice"
        $script:voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice)
    }
}
# Runs once per pipeline item: synthesizes $text, writes the speech marks to the
# pipeline and sends the WAV bytes to -variable, -path or the speakers.
# With -listVoices it only lists the voices cached in $script:voices.
process {
    if ($listVoices) {
        return $script:voices | ForEach-Object {
            [PSCustomObject]@{
                languageCode = $_.Language
                id = $_.DisplayName
                name = $_.Description
                ssmlGender = $_.Gender
            }
        }
    }

    if (-not $text) {
        Write-Error "No text specified"
        return
    }

    $stream = $null
    $speech = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new()
    try {
        $speech.Options.AppendedSilence = [Windows.Media.SpeechSynthesis.SpeechAppendedSilence]::Min

        if ($voice) {
            $voiceInfo = $script:voices | Where-Object { $_.DisplayName -imatch $voice } | Select-Object -First 1
            if ($voiceInfo) {
                $speech.Voice = $voiceInfo
            } else {
                Write-Debug "No voice found matching $voice"
            }
        }

        if ($speechMarkTypes -match 'sentence') {
            $speech.Options.IncludeSentenceBoundaryMetadata = $true
        }
        # 'word' matches both "word" (the type name used in the output) and the
        # documented "words".
        if ($speechMarkTypes -match 'word') {
            $speech.Options.IncludeWordBoundaryMetadata = $true
        }

        # Min/Max instead of [math]::Clamp: Clamp does not exist on .NET Framework,
        # which is what Windows PowerShell 5.1 runs on.
        if ($rate -ne 1.0) {
            $speech.Options.SpeakingRate = [math]::Max(0.5, [math]::Min($rate, 6.0))
        }
        if ($volume -ne 1.0) {
            $speech.AudioVolume = [math]::Max(0.0, [math]::Min($volume, 1.0))
        }

        $ssmlNamespace = 'http://www.w3.org/2001/10/synthesis'

        # Plain text is XML-escaped and wrapped; input that already starts with
        # <speak is treated as SSML and parsed as given.
        $ssml = $text
        if (-not $ssml.Trim().StartsWith('<speak')) {
            $ssml = [System.Security.SecurityElement]::Escape($ssml)
            $ssml = "<speak version=`"1.0`">$ssml</speak>"
        }
        $dom = [xml]$ssml
        # DocumentElement is always an XmlElement; $dom.speak is adapted to a plain
        # string when <speak> has no attributes and only text, which breaks SetAttribute.
        $root = $dom.DocumentElement
        $root.SetAttribute('version', '1.0')
        # Honor -lang when given, otherwise fall back to the selected voice's language.
        $xmlLang = if ($lang) { $lang } else { $speech.Voice.Language }
        $root.SetAttribute('xml:lang', $xmlLang)
        $root.SetAttribute('xmlns', $ssmlNamespace)
        $ssml = $root.OuterXml

        # actually speak - create data stream
        try {
            $stream = Await ($speech.SynthesizeSsmlToStreamAsync($ssml)) ([Windows.Media.SpeechSynthesis.SpeechSynthesisStream])
        } catch {
            Write-Error "Error creating stream $_"
            # $_ is an ErrorRecord; the AggregateException raised by Task.Wait sits
            # somewhere in its exception chain, so walk down to it.
            $failure = $_.Exception
            while ($failure -and $failure -isnot [System.AggregateException]) {
                $failure = $failure.InnerException
            }
            if ($failure) {
                $failure.InnerExceptions | ForEach-Object {
                    Write-Error "$($_.GetType().Name), $($_.Message)"
                }
            }
            return
        }
        if (-not $stream.Size) {
            # error occurred
            Write-Error "Error Creating Synthesis Stream - no results"
            return
        }

        if ($speechMarkTypes -ne '') {
            $markers = ParseMarkers $stream.TimedMetadataTracks
            $markers
        }

        # create destination buffer and wait for the copy to finish
        $bytes = [array]::CreateInstance([byte], $stream.Size)
        [Windows.Storage.Streams.IBuffer]$buffer = [System.Runtime.InteropServices.WindowsRuntime.WindowsRuntimeBufferExtensions]::AsBuffer($bytes)
        AwaitWithProgress ($stream.ReadAsync($buffer, [uint32]$stream.Size, [Windows.Storage.Streams.InputStreamOptions]::None)) ([Windows.Storage.Streams.IBuffer]) ([UInt32])

        # write out
        if ($variable) {
            WaveToVariable $variable $bytes
        } elseif ($path) {
            SaveWave $path $bytes
        } else {
            PlayWave $bytes
        }
    } finally {
        # Release the native objects for every pipeline item and on every early
        # return. The variables are cleared afterwards so the end block does not
        # dispose the same objects a second time.
        if ($null -ne $stream) {
            $stream.Dispose()
        }
        $speech.Dispose()
        $stream = $null
        $speech = $null
    }
}
# Final cleanup after all pipeline input: dispose the stream and the synthesizer
# if the process block left them set.
end {
    foreach ($leftover in $stream, $speech) {
        if ($leftover) {
            $leftover.Dispose();
        }
    }
}
	<#
	.SYNOPSIS

	Speak text using SSML

	.DESCRIPTION

	Speak text using SSML, using built in MS speech synthesis. Will output metadata about result, including any embedded <mark>s

	.PARAMETER Text
	(default from pipeline)
	The Text to speak. Text will automatically be wrapped in <speak> if necessary.

	.PARAMETER Path
	Output to disk instead of speakers. This is output filename. Output is always in WAV format (PCM s16le)

	.PARAMETER Variable
	Store the audio in a global variable with this name, as a byte array, instead of playing it. Output is always in WAV format (PCM s16le)

	.PARAMETER listVoices
	If -listVoices is passed then this function will just output a list of available
	voices in the format {languageCode, id, name, ssmlGender}

	.PARAMETER Voice
	Name of voice to use. To get list of voices use -listVoices option

	.PARAMETER Rate
	Speech rate in range 0.33 - 3 (values below 0.5 are raised to 0.5) -- 1 = 100%, 2 is twice as fast

	.PARAMETER Volume
	Volume in range 0-1, 1 (default) is full volume

	.PARAMETER SampleRate
	SampleRate of output WAV file. Default is 24000

	.PARAMETER Channels
	Number of channels of output WAV file. Default is 1

	.PARAMETER Lang
	Language to use. Default is the default voice's language.

	.PARAMETER SpeechMarkTypes
	Marks to include in output. Default is sentence,words,ssml. set to "" to not output any marks


	.INPUTS

	System.String. You can pipe the "Text" parameter into the script

	.OUTPUTS
	Will play sound to speakers by default, or write to disk if -Path is specified.
	Output is the list of speech marks found in the synthesized audio (nothing when -SpeechMarkTypes is ""):
	{type: string (word | sentence | viseme | ssml | unknown), time: int (milliseconds), value: string (cue text), start/end: int (position in the input, only when the cue reports one)}

	.EXAMPLE
	PS> ./out-ssml-winrt.ps1 "hello world"

	#>
	param(
	# Plain text or SSML to speak; bound from the pipeline, one item per process-block call.
	[Parameter(ValueFromPipeline = $true)] [string] $text,
	# Pattern matched against the DisplayName of the installed voices.
	[Parameter(Mandatory = $false)] [string] $voice,
	# Speaking rate multiplier, 1.0 = normal speed.
	[Parameter(Mandatory = $false)]
	[ValidateRange(0.33, 3.0)] [double] $rate = 1.0,
	# Audio volume, 1.0 = full volume.
	[Parameter(Mandatory = $false)]
	[ValidateRange(0.0, 1.0)] [double] $volume = 1.0,
	# Output file name; when set (and -variable is not) the audio is written to disk.
	[Parameter(Mandatory = $false)] [string] $path,
	# Name of a global variable that receives the audio bytes; takes precedence over -path.
	[Parameter(Mandatory = $false)] [string] $variable,
	# NOTE(review): $sampleRate and $channels are not referenced by the visible
	# begin/process/end blocks -- confirm whether they are meant to have an effect.
	[Parameter(Mandatory = $false)] [int] $sampleRate = 24000,
	[Parameter(Mandatory = $false)] [int] $channels = 1,
	# Language tag; see the .PARAMETER Lang help entry.
	[Parameter(Mandatory = $false)] [string] $lang,
	# Comma separated list of mark kinds to request; "" suppresses mark output.
	[Parameter(Mandatory = $false)] [string] $speechMarkTypes = "sentence,words,ssml",
	# When set, the script only lists the available voices.
	[Switch] $listVoices
	)
	begin {

	# Load the .NET <-> WinRT interop assembly that provides
	# System.WindowsRuntimeSystemExtensions (AsTask) used below.
	Add-Type -AssemblyName System.Runtime.WindowsRuntime

	# Reference each WinRT type once and discard the value; presumably this forces
	# PowerShell to load the WinRT projection for the type before it is used -- TODO confirm.
	[void][Windows.Foundation.IAsyncOperation`1, Windows.Foundation, ContentType=WindowsRuntime]
	[void][Windows.Foundation.IAsyncOperationWithProgress`2, Windows.Foundation, ContentType=WindowsRuntime]
	[void][Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
	[void][Windows.Media.SpeechSynthesis.VoiceInformation, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
	[void][Windows.Media.SpeechSynthesis.SpeechSynthesisStream, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
	[void][Windows.Media.Core.SpeechCue, Windows.Media.Core, ContentType=WindowsRuntime]
	[void][Windows.Media.Core.TimedMetadataTrack, Windows.Media.Core, ContentType=WindowsRuntime]

	# All single-parameter AsTask overloads of WindowsRuntimeSystemExtensions.
	# (The pipes were written as "\|", which PowerShell cannot parse.)
	$_taskMethods = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
	    $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1
	}

	# Open generic AsTask overloads, picked by the type name of their only parameter:
	# $asTaskGeneric takes IAsyncOperation`1 (used by Await),
	# $asTaskGeneric2 takes IAsyncOperationWithProgress`2 (used by AwaitWithProgress).
	$asTaskGeneric = ($_taskMethods | Where-Object { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1' })[0]
	$asTaskGeneric2 = ($_taskMethods | Where-Object { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperationWithProgress`2' })[0]

	Function Await($WinRtTask, $ResultType) {
	    # Converts a WinRT IAsyncOperation<ResultType> into a .NET task through the
	    # AsTask overload cached in $asTaskGeneric, blocks until the task has
	    # finished (no timeout) and returns the task's result.
	    # The boolean returned by Wait is discarded by assignment; the original
	    # "\| Out-Null" was an escaped pipe that PowerShell cannot parse.
	    $asTask = $asTaskGeneric.MakeGenericMethod($ResultType)
	    $netTask = $asTask.Invoke($null, @($WinRtTask))
	    $null = $netTask.Wait(-1)
	    $netTask.Result
	}
	Function AwaitWithProgress($WinRtTask, $ResultType1, $ResultType2) {
	    # Same idea as Await, but for IAsyncOperationWithProgress<ResultType1, ResultType2>
	    # (AsTask overload cached in $asTaskGeneric2). Blocks until the operation has
	    # finished and writes nothing to the pipeline.
	    # The boolean returned by Wait is discarded by assignment; the original
	    # "\| Out-Null" was an escaped pipe that PowerShell cannot parse.
	    $asTask = $asTaskGeneric2.MakeGenericMethod($ResultType1, $ResultType2)
	    $netTask = $asTask.Invoke($null, @($WinRtTask))
	    $null = $netTask.Wait(-1)
	}

	Function ParseMarkers($timedTextTracks) {
	    # Flattens the cues of the given timed metadata tracks into mark objects.
	    #
	    # $timedTextTracks: collection of objects exposing .Id and .Cues; each cue is
	    #   read for .StartTime, .Text and (optionally) .StartPositionInInput /
	    #   .EndPositionInInput.
	    # Output: one PSCustomObject per cue with non-empty text:
	    #   type  - word | sentence | viseme | ssml | unknown (derived from the track id)
	    #   time  - cue start time in milliseconds
	    #   value - cue text
	    #   start / end - input positions, present only when the cue reports a start position
	    # Plain foreach loops replace the escaped "\|" pipelines (which do not parse),
	    # and objects are written straight to the pipeline instead of being appended
	    # to an array with +=.
	    foreach ($track in $timedTextTracks) {
	        $markType = switch ($track.Id) {
	            "SpeechWord" { "word" }
	            "SpeechSentence" { "sentence" }
	            "SpeechViseme" { "viseme" }
	            "SpeechBookmark" { "ssml" }
	            Default { "unknown" }
	        }
	        foreach ($cue in $track.Cues) {
	            # Cues without text carry nothing worth reporting.
	            if (-not $cue.Text) {
	                continue
	            }
	            $fields = [ordered]@{
	                type = $markType
	                time = [int]$cue.StartTime.TotalMilliseconds
	                value = $cue.Text
	            }
	            # Compare against $null explicitly: a plain truthiness test treats a
	            # start position of 0 as "missing" and would drop start/end for it.
	            if ($null -ne $cue.StartPositionInInput) {
	                $fields['start'] = $cue.StartPositionInInput
	                $fields['end'] = $cue.EndPositionInInput
	            }
	            [PSCustomObject]$fields
	        }
	    }
	}

	Function PlayWave([System.Byte[]]$bytes) {
	    # Plays a complete WAV file held in $bytes on the default audio device and
	    # blocks until playback has finished.
	    # try/finally guarantees the player and the backing stream are released even
	    # when the player rejects the data or playback fails.
	    $memstream = [System.IO.MemoryStream]::new($bytes)
	    try {
	        $player = [System.Media.SoundPlayer]::new($memstream)
	        try {
	            $player.PlaySync()
	        } finally {
	            $player.Dispose()
	        }
	    } finally {
	        $memstream.Dispose()
	    }
	}

	Function SaveWave($path, [System.Byte[]]$bytes) {
	    # Writes $bytes to $path, replacing any existing file. A rooted path is used
	    # as given; anything else is resolved against PowerShell's current
	    # location ($pwd) before writing.
	    $target = $path
	    if (-not [System.IO.Path]::IsPathRooted($path)) {
	        $combined = Join-Path $pwd $path
	        $target = [System.IO.Path]::GetFullPath($combined)
	    }
	    [System.IO.File]::WriteAllBytes($target, $bytes)
	}

	Function WaveToVariable($variable, [System.Byte[]]$bytes) {
	    # Publishes the audio bytes as a global variable named by $variable,
	    # creating the variable or overwriting an existing one.
	    $assignment = @{
	        Scope = 'global'
	        Name  = $variable
	        Value = $bytes
	    }
	    Set-Variable @assignment
	}

	# Cache the installed voices once for all pipeline input. Every read and write
	# uses the script: scope explicitly, so the fallback below always lands in the
	# same variable the process block reads (an unqualified $voices can end up in
	# a different scope, e.g. when the script is dot-sourced from a function).
	$script:voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices
	if (-not $script:voices.Id) {
	    Write-Debug "Unable to get installed voices list. Script will only use default voice"
	    $script:voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice)
	}
	}
	# Runs once per pipeline item: synthesizes $text, writes the speech marks to the
	# pipeline and sends the WAV bytes to -variable, -path or the speakers.
	# With -listVoices it only lists the voices cached in $script:voices.
	# All pipelines use a real "|"; the escaped "\|" form does not parse.
	process {
	    if ($listVoices) {
	        return $script:voices | ForEach-Object {
	            [PSCustomObject]@{
	                languageCode = $_.Language
	                id = $_.DisplayName
	                name = $_.Description
	                ssmlGender = $_.Gender
	            }
	        }
	    }

	    if (-not $text) {
	        Write-Error "No text specified"
	        return
	    }

	    $stream = $null
	    $speech = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new()
	    try {
	        $speech.Options.AppendedSilence = [Windows.Media.SpeechSynthesis.SpeechAppendedSilence]::Min

	        if ($voice) {
	            $voiceInfo = $script:voices | Where-Object { $_.DisplayName -imatch $voice } | Select-Object -First 1
	            if ($voiceInfo) {
	                $speech.Voice = $voiceInfo
	            } else {
	                Write-Debug "No voice found matching $voice"
	            }
	        }

	        if ($speechMarkTypes -match 'sentence') {
	            $speech.Options.IncludeSentenceBoundaryMetadata = $true
	        }
	        # 'word' matches both "word" (the type name used in the output) and the
	        # documented "words".
	        if ($speechMarkTypes -match 'word') {
	            $speech.Options.IncludeWordBoundaryMetadata = $true
	        }

	        # Min/Max instead of [math]::Clamp: Clamp does not exist on .NET Framework,
	        # which is what Windows PowerShell 5.1 runs on.
	        if ($rate -ne 1.0) {
	            $speech.Options.SpeakingRate = [math]::Max(0.5, [math]::Min($rate, 6.0))
	        }
	        if ($volume -ne 1.0) {
	            $speech.AudioVolume = [math]::Max(0.0, [math]::Min($volume, 1.0))
	        }

	        $ssmlNamespace = 'http://www.w3.org/2001/10/synthesis'

	        # Plain text is XML-escaped and wrapped; input that already starts with
	        # <speak is treated as SSML and parsed as given.
	        $ssml = $text
	        if (-not $ssml.Trim().StartsWith('<speak')) {
	            $ssml = [System.Security.SecurityElement]::Escape($ssml)
	            $ssml = "<speak version=`"1.0`">$ssml</speak>"
	        }
	        $dom = [xml]$ssml
	        # DocumentElement is always an XmlElement; $dom.speak is adapted to a plain
	        # string when <speak> has no attributes and only text, which breaks SetAttribute.
	        $root = $dom.DocumentElement
	        $root.SetAttribute('version', '1.0')
	        # Honor -lang when given, otherwise fall back to the selected voice's language.
	        $xmlLang = if ($lang) { $lang } else { $speech.Voice.Language }
	        $root.SetAttribute('xml:lang', $xmlLang)
	        $root.SetAttribute('xmlns', $ssmlNamespace)
	        $ssml = $root.OuterXml

	        # actually speak - create data stream
	        try {
	            $stream = Await ($speech.SynthesizeSsmlToStreamAsync($ssml)) ([Windows.Media.SpeechSynthesis.SpeechSynthesisStream])
	        } catch {
	            Write-Error "Error creating stream $_"
	            # $_ is an ErrorRecord; the AggregateException raised by Task.Wait sits
	            # somewhere in its exception chain, so walk down to it.
	            $failure = $_.Exception
	            while ($failure -and $failure -isnot [System.AggregateException]) {
	                $failure = $failure.InnerException
	            }
	            if ($failure) {
	                $failure.InnerExceptions | ForEach-Object {
	                    Write-Error "$($_.GetType().Name), $($_.Message)"
	                }
	            }
	            return
	        }
	        if (-not $stream.Size) {
	            # error occurred
	            Write-Error "Error Creating Synthesis Stream - no results"
	            return
	        }

	        if ($speechMarkTypes -ne '') {
	            $markers = ParseMarkers $stream.TimedMetadataTracks
	            $markers
	        }

	        # create destination buffer and wait for the copy to finish
	        $bytes = [array]::CreateInstance([byte], $stream.Size)
	        [Windows.Storage.Streams.IBuffer]$buffer = [System.Runtime.InteropServices.WindowsRuntime.WindowsRuntimeBufferExtensions]::AsBuffer($bytes)
	        AwaitWithProgress ($stream.ReadAsync($buffer, [uint32]$stream.Size, [Windows.Storage.Streams.InputStreamOptions]::None)) ([Windows.Storage.Streams.IBuffer]) ([UInt32])

	        # write out
	        if ($variable) {
	            WaveToVariable $variable $bytes
	        } elseif ($path) {
	            SaveWave $path $bytes
	        } else {
	            PlayWave $bytes
	        }
	    } finally {
	        # Release the native objects for every pipeline item and on every early
	        # return. The variables are cleared afterwards so the end block does not
	        # dispose the same objects a second time.
	        if ($null -ne $stream) {
	            $stream.Dispose()
	        }
	        $speech.Dispose()
	        $stream = $null
	        $speech = $null
	    }
	}
	# Final cleanup after all pipeline input: dispose the stream and the synthesizer
	# if the process block left them set.
	end {
	    foreach ($leftover in $stream, $speech) {
	        if ($leftover) {
	            $leftover.Dispose();
	        }
	    }
	}