Skip to content

Instantly share code, notes, and snippets.

@dedmen
Created January 15, 2024 09:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dedmen/575c0d597dc2c0c3b19ea9c6c64d1794 to your computer and use it in GitHub Desktop.
Save dedmen/575c0d597dc2c0c3b19ea9c6c64d1794 to your computer and use it in GitHub Desktop.
TextToSpeech.cpp
//Header File:
#pragma once
struct ISpVoice;
struct ISpStreamFormat;

namespace Speech
{
    /**
    \brief Wrapper around a SAPI voice (ISpVoice) that speaks XML/SSML markup asynchronously
           and reports speech events through the On* member functions.

    Speech is started with SpeakXML()/SpeakSSML(); Tick() has to be called regularly afterwards
    because it is the function that polls the voice and invokes the On* handlers.
    The constructor registers `this` with the voice as the notification target, so the instance
    has to stay at a stable address and stay alive until speech is over.
    */
    class TextToSpeech
    {
        friend class TextToSpeechClass;

        //! SAPI voice; empty when initialization failed (all member functions check for that)
        std::shared_ptr<ISpVoice> m_Voice;
        //! Set by SpeakXML()/SpeakSSML(), cleared by Tick() once the voice reports completion
        bool m_IsSpeaking = false;
        //! Set by OnTTSNotification()
        bool m_HasPendingTTSNotifications = false;
        //! Lazily created output stream used by SetAudioEnabled(false)
        std::shared_ptr<ISpStreamFormat> m_NullStream;

    public:
        TextToSpeech();

        /**
        \brief The TextToSpeech instance needs to stay alive until speech is over https://learn.microsoft.com/en-us/previous-versions/windows/desktop/ms717077(v=vs.85)
        \param stuff UTF-8 encoded SAPI XML markup to speak
        */
        void SpeakXML(std::string_view stuff);

        /**
        \brief The TextToSpeech instance needs to stay alive until speech is over https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup?tabs=csharp
        \param stuff UTF-8 encoded SSML markup to speak
        */
        void SpeakSSML(std::string_view stuff);

        /**
        \brief Required to be called for events to fire
        */
        void Tick();

        /**
        \brief Routes speech to the default audio device (true) or to m_NullStream (false)
        */
        void SetAudioEnabled(bool enabled);

        /**
        \brief Called during speech to update visual face/mouth state https://learn.microsoft.com/en-us/previous-versions/windows/desktop/ms717289(v=vs.85)
        */
        void OnMouthStateChanged(int viseme); //#TODO implement this as a event that's forwarded to script via extension callback

        //! https://learn.microsoft.com/en-us/previous-versions/windows/desktop/ee431828(v=vs.85) phoneme ID
        void OnPhoneme(int durationMilliseconds, int phoneme); //#TODO implement this as a event that's forwarded to script via extension callback

        /**
        \brief Called when whole queued speech has completed
        */
        void OnSpeechDone(); //#TODO implement this as a event that's forwarded to script via extension callback

        /**
        \brief Called when a XML bookmark tag has been hit
        */
        void OnBookmark(std::string_view name); //#TODO implement this as a event that's forwarded to script via extension callback

        /**
        \brief Invoked by the SAPI notification callback; only records that notifications are pending
        */
        void OnTTSNotification();
    }; // the terminating ';' is mandatory for a class definition
}
//Source file:
#include "TextToSpeech.h"
#include "sdkddkver.h"
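// Work around Windows SDK headers referencing _WIN32_WINNT_WIN10_* macros that are not defined (reported as warnings or errors depending on build settings): https://developercommunity.visualstudio.com/t/several-warnings-in-windows-sdk-100177630-in-windo/435362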
#define _WIN32_WINNT_WIN10_TH2 NTDDI_WIN10_TH2
#define _WIN32_WINNT_WIN10_RS1 NTDDI_WIN10_RS1
#define _WIN32_WINNT_WIN10_RS2 NTDDI_WIN10_RS2
#define _WIN32_WINNT_WIN10_RS3 NTDDI_WIN10_RS3
#define _WIN32_WINNT_WIN10_RS4 NTDDI_WIN10_RS4
#define _WIN32_WINNT_WIN10_RS5 NTDDI_WIN10_RS5
// warning C4996: 'GetVersionExW': was declared deprecated
#pragma warning(disable: 4996)
#include "sphelper.h"
namespace Speech
{
/**
\brief Notification trampoline handed to the voice by the TextToSpeech constructor.
\param wParam the TextToSpeech instance that registered the callback (the constructor passes `this`)
\param lParam unused
*/
void __stdcall SPNOTIFYCALLBACK(WPARAM wParam, LPARAM lParam)
{
    auto* const speaker = reinterpret_cast<TextToSpeech*>(wParam);
    speaker->OnTTSNotification();
}
/**
\brief Initializes COM for the calling thread, creates the SAPI voice and subscribes to the
       phoneme, viseme and bookmark events that Tick() dispatches.

On failure m_Voice stays empty; every other member function checks for that and does nothing.
*/
TextToSpeech::TextToSpeech()
{
    if (FAILED(CoInitialize(nullptr)))
    {
        //#TODO log error
        return;
    }
    // NOTE(review): no matching CoUninitialize() is visible in this file - confirm where COM is torn down.

    //Initialize SAPI
    ISpVoice* rawVoice = nullptr;
    const HRESULT createResult = CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_ISpVoice, reinterpret_cast<void**>(&rawVoice));
    if (FAILED(createResult) || rawVoice == nullptr)
    {
        //#TODO log error
        return; // m_Voice stays empty, nothing below may touch it
    }

    // COM objects are reference counted: hand the reference back with Release() instead of delete.
    m_Voice = std::shared_ptr<ISpVoice>(rawVoice, [](ISpVoice* voice) { voice->Release(); });

    // `this` travels through wParam and is cast back in SPNOTIFYCALLBACK.
    // NOTE(review): per the SAPI documentation this callback mechanism relies on the calling thread
    // pumping window messages; Tick() polls the event queue regardless of whether it fires.
    const HRESULT callbackResult = m_Voice->SetNotifyCallbackFunction(SPNOTIFYCALLBACK, reinterpret_cast<WPARAM>(this), 0);

    // Same mask for "interesting" and "queued" events
    const ULONGLONG eventMask = SPFEI(SPEI_PHONEME) | SPFEI(SPEI_VISEME) | SPFEI(SPEI_TTS_BOOKMARK);
    const HRESULT interestResult = m_Voice->SetInterest(eventMask, eventMask);

    if (FAILED(callbackResult) || FAILED(interestResult))
    {
        //#TODO log error (speech itself still works, only events are affected)
    }
}
void TextToSpeech::SpeakXML(std::string_view stuff)
{
if (!m_Voice)
return; // #TODO error print
#TODO convert string from UTF-8 to UTF-16
std::wstring speechStr;
// Utf8ToWideChar(speechStr, stuff.Data());
m_Voice->Speak( speechStr.Data(), SPF_ASYNC | SPF_IS_XML | SPF_PURGEBEFORESPEAK, NULL);
m_IsSpeaking = true;
}
void TextToSpeech::SpeakSSML(std::string_view stuff)
{
if (!m_Voice)
return; // #TODO error print
#TODO convert string from UTF-8 to UTF-16
std::wstring speechStr;
//Utf8ToWideChar(speechStr, stuff.Data());
m_Voice->Speak( speechStr.Data(), SPF_ASYNC | SPF_PARSE_SSML | SPF_PURGEBEFORESPEAK, NULL);
m_IsSpeaking = true;
}
void TextToSpeech::Tick()
{
if (!m_Voice || !m_IsSpeaking)
return; // #TODO error print
bool signalled = WaitForSingleObjectEx(m_Voice->SpeakCompleteEvent(), 0, true) != WAIT_TIMEOUT;
if (signalled)
{
m_IsSpeaking = false;
OnSpeechDone();
}
if (true) // m_HasPendingTTSNotifications
{
SPEVENT eventItem;
memset( &eventItem, 0,sizeof(SPEVENT));
while (m_Voice->GetEvents(1, &eventItem , NULL) == S_OK)
{
switch (eventItem.eEventId)
{
case SPEI_VISEME:
{
auto viseme = static_cast<SPVISEMES>(LOWORD(eventItem.lParam));
OnMouthStateChanged(viseme);
break;
}
case SPEI_TTS_BOOKMARK:
{
#TODO convert string from UTF-16 to UTF-8
std::string bookmarkName;
// WideCharToUtf8(bookmarkName, (wchar_t*)eventItem.lParam);
OnBookmark(bookmarkName);
break;
}
case SPEI_PHONEME:
{
OnPhoneme(HIWORD(eventItem.wParam), LOWORD(eventItem.lParam));
}
default:
break;
}
SpClearEvent(&eventItem);
}
}
}
/**
\brief Output stream that accepts and discards all audio data; used to mute the voice.

Reference counting is a no-op: the object's lifetime is controlled by whoever created it
(TextToSpeech::m_NullStream), not by COM.
*/
struct NullSPStream : ISpStreamFormat
{
    // IUnknown

    HRESULT STDMETHODCALLTYPE QueryInterface(const IID& riid, void** ppvObject) override
    {
        assert(ppvObject != nullptr);
        if (ppvObject == nullptr)
            return E_POINTER;

        const bool supported = riid == __uuidof(IUnknown)
            || riid == __uuidof(ISequentialStream)
            || riid == __uuidof(IStream)
            || riid == __uuidof(ISpStreamFormat);
        if (!supported)
        {
            *ppvObject = nullptr;
            return E_NOINTERFACE;
        }

        *ppvObject = static_cast<ISpStreamFormat*>(this);
        AddRef();
        return S_OK;
    }

    ULONG STDMETHODCALLTYPE AddRef() override
    {
        return 1; // not reference counted, see struct comment
    }

    ULONG STDMETHODCALLTYPE Release() override
    {
        return 1; // not reference counted, see struct comment
    }

    // ISequentialStream

    HRESULT STDMETHODCALLTYPE Read(void* pv, ULONG cb, ULONG* pcbRead) override
    {
        if (pcbRead)
            *pcbRead = 0;
        return E_NOTIMPL;
    }

    //! Discards the data but reports all of it as written, otherwise the caller sees a short write
    HRESULT STDMETHODCALLTYPE Write(const void* pv, ULONG cb, ULONG* pcbWritten) override
    {
        if (pcbWritten)
            *pcbWritten = cb;
        return S_OK;
    }

    // IStream

    //! Nothing is stored, so the requested offset is simply echoed back as the new position
    HRESULT STDMETHODCALLTYPE Seek(LARGE_INTEGER dlibMove, DWORD dwOrigin, ULARGE_INTEGER* plibNewPosition) override
    {
        if (plibNewPosition)
            plibNewPosition->QuadPart = dlibMove.QuadPart;
        return S_OK;
    }

    HRESULT STDMETHODCALLTYPE SetSize(ULARGE_INTEGER libNewSize) override
    {
        return E_NOTIMPL;
    }

    HRESULT STDMETHODCALLTYPE CopyTo(IStream* pstm, ULARGE_INTEGER cb, ULARGE_INTEGER* pcbRead, ULARGE_INTEGER* pcbWritten) override
    {
        return E_NOTIMPL;
    }

    HRESULT STDMETHODCALLTYPE Commit(DWORD grfCommitFlags) override
    {
        return E_NOTIMPL;
    }

    HRESULT STDMETHODCALLTYPE Revert() override
    {
        return E_NOTIMPL;
    }

    HRESULT STDMETHODCALLTYPE LockRegion(ULARGE_INTEGER libOffset, ULARGE_INTEGER cb, DWORD dwLockType) override
    {
        return E_NOTIMPL;
    }

    HRESULT STDMETHODCALLTYPE UnlockRegion(ULARGE_INTEGER libOffset, ULARGE_INTEGER cb, DWORD dwLockType) override
    {
        return E_NOTIMPL;
    }

    HRESULT STDMETHODCALLTYPE Stat(STATSTG* pstatstg, DWORD grfStatFlag) override
    {
        return E_NOTIMPL;
    }

    HRESULT STDMETHODCALLTYPE Clone(IStream** ppstm) override
    {
        if (ppstm)
            *ppstm = nullptr;
        return E_NOTIMPL;
    }

    // ISpStreamFormat

    /**
    \brief Reports 8 bit mono PCM at 44100 Hz.
    \param pguidFormatId receives SPDFID_WaveFormatEx
    \param ppCoMemWaveFormatEx receives a CoTaskMemAlloc'ed WAVEFORMATEX
    */
    HRESULT STDMETHODCALLTYPE GetFormat(GUID* pguidFormatId, WAVEFORMATEX** ppCoMemWaveFormatEx) override
    {
        if (pguidFormatId == nullptr || ppCoMemWaveFormatEx == nullptr)
            return E_POINTER;

        auto* const format = static_cast<WAVEFORMATEX*>(CoTaskMemAlloc(sizeof(WAVEFORMATEX)));
        if (format == nullptr)
        {
            *ppCoMemWaveFormatEx = nullptr;
            return E_OUTOFMEMORY;
        }

        format->wFormatTag = WAVE_FORMAT_PCM;
        format->nChannels = 1;
        format->nSamplesPerSec = 44100;
        format->wBitsPerSample = 8;
        format->nBlockAlign = 1; // nChannels * wBitsPerSample / 8
        // For PCM this has to be nSamplesPerSec * nBlockAlign
        format->nAvgBytesPerSec = format->nSamplesPerSec * format->nBlockAlign;
        format->cbSize = 0;

        *pguidFormatId = SPDFID_WaveFormatEx;
        *ppCoMemWaveFormatEx = format;
        return S_OK;
    }
};
void TextToSpeech::SetAudioEnabled(bool enabled)
{
if (!m_Voice)
return; // #TODO error print
if (enabled)
m_Voice->SetOutput(nullptr, TRUE); // Default Audio device
else
{
if (!m_NullStream)
m_NullStream = new NullSPStream();
auto res = m_Voice->SetOutput(m_NullStream, TRUE);
__nop();
}
}
void TextToSpeech::OnTTSNotification()
{
m_HasPendingTTSNotifications = true;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment