Skip to content

Instantly share code, notes, and snippets.

@dspinellis
Created October 8, 2019 10:30
Show Gist options
  • Save dspinellis/4d0a6a6e73d15a520b5c78d55414652e to your computer and use it in GitHub Desktop.
Save dspinellis/4d0a6a6e73d15a520b5c78d55414652e to your computer and use it in GitHub Desktop.
Command-line tool to convert speech in a WAV audio file into text using Windows SAPI
/*
* Convert the specified speech WAV file into text output
* on the program's standard output.
*
* Diomidis Spinellis, October 2019
* Based on https://stackoverflow.com/a/40002268/20520
*/
#include <iostream>
#include <sapi.h>
#include <sphelper.h>
int main(int argc, char* argv[])
{
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " file.wav\n";
return 1;
}
::CoInitialize(NULL);
HRESULT hr = S_OK;
CComPtr<ISpStream> cpInputStream;
CComPtr<ISpRecognizer> cpRecognizer;
CComPtr<ISpRecoContext> cpRecoContext;
CComPtr<ISpRecoGrammar> cpRecoGrammar;
hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
std::string sInputFileName(argv[1]);
std::wstring wInputFileName(sInputFileName.begin(), sInputFileName.end());
hr = cpInputStream->BindToFile(wInputFileName.c_str(), SPFM_OPEN_READONLY, NULL, NULL, SPFEI_ALL_EVENTS);
if (FAILED(hr)) {
std::cerr << "Unable to open " << argv[1] << '\n';
return 1;
}
hr = cpRecognizer->SetInput(cpInputStream, TRUE);
hr = cpRecognizer->CreateRecoContext(&cpRecoContext);
hr = cpRecoContext->CreateGrammar(NULL, &cpRecoGrammar);
hr = cpRecoGrammar->LoadDictation(NULL, SPLO_STATIC);
hr = cpRecoContext->SetNotifyWin32Event();
hr = cpRecoContext->SetInterest(SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM), SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM));
hr = cpRecoGrammar->SetDictationState(SPRS_ACTIVE);
BOOL fEndStreamReached = FALSE;
while (!fEndStreamReached && cpRecoContext->WaitForNotifyEvent(INFINITE) == S_OK) {
CSpEvent spEvent;
ISpRecoResult *pPhrase;
SPPHRASE *phrase;
while (!fEndStreamReached && spEvent.GetFrom(cpRecoContext) == S_OK) {
switch (spEvent.eEventId) {
case SPEI_RECOGNITION:
pPhrase = spEvent.RecoResult();
phrase = NULL;
pPhrase->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, NULL, NULL);
pPhrase->GetPhrase(&phrase);
if (phrase == NULL || phrase->pElements == NULL)
break;
for (int i = 0; i < phrase->Rule.ulCountOfElements; i++)
if (phrase->pElements[i].pszDisplayText != NULL)
std::wcout << phrase->pElements[i].pszDisplayText << ' ';
break;
case SPEI_END_SR_STREAM:
fEndStreamReached = TRUE;
break;
}
spEvent.Clear();
}
}
hr = cpRecoGrammar->SetDictationState(SPRS_INACTIVE);
hr = cpRecoGrammar->UnloadDictation();
hr = cpInputStream->Close();
::CoUninitialize();
std::wcout << '\n';
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment