An updated version of the SpeechManager class that uses the Kinect 1.8 SDK. An example of the class in action within a WinForms app can be found via my GitHub account.
// <copyright file="SpeechManager.cs" company="dyadica.co.uk">
// Copyright (c) 2010, 2014 All Rights Reserved, http://www.dyadica.co.uk
// This source is subject to the dyadica.co.uk Permissive License.
// Please see the http://www.dyadica.co.uk/permissive-license file for more information.
// All other rights reserved.
// THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
// KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
// PARTICULAR PURPOSE.
//
// </copyright>
// <author>SJB</author>
// <email>contact via facebook.com/adropinthedigitalocean</email>
// <date>31.05.2014</date>
// <summary>A simple C# class containing the code required to perform speech
// recognition with the Kinect SDK version 1.8.
// Based on and using code from http://msdn.microsoft.com/en-us/library/hh855387.aspx
// </summary>
using System;
using System.Collections.Generic;
using System.Text;
using Microsoft.Kinect;
using Microsoft.Kinect.Interop;
using Microsoft.Speech;
using Microsoft.Speech.AudioFormat;
using Microsoft.Speech.Recognition;
using System.IO;
namespace Kinect_Speech_WF
{
    public class SpeechManager
    {
        /// <summary>
        /// Active Kinect sensor.
        /// </summary>
        public KinectSensor Sensor;

        /// <summary>
        /// Speech recognition engine using audio data from Kinect.
        /// </summary>
        public SpeechRecognitionEngine SpeechEngine;

        /// <summary>
        /// The path location for the required grammar file.
        /// </summary>
        public string GrammarFile;

        /// <summary>
        /// Basic constructor that accepts both a reference to the
        /// Kinect sensor and the location of the grammar file to
        /// be used by the speech engine.
        /// </summary>
        /// <param name="sensor">The Kinect sensor providing the audio.</param>
        /// <param name="grammarFile">Path to the grammar XML file.</param>
        public SpeechManager(KinectSensor sensor, string grammarFile)
        {
            // Set the sensor to that passed in.
            Sensor = sensor;

            // Set the location of the grammar to be used.
            GrammarFile = grammarFile;

            // Set things going.
            StartRecognisingSpeech();
        }
        /// <summary>
        /// Basic constructor that accepts a reference to the
        /// Kinect sensor and then generates a default path to
        /// the location of the grammar file to be used by the
        /// speech engine.
        /// </summary>
        /// <param name="sensor">The Kinect sensor providing the audio.</param>
        public SpeechManager(KinectSensor sensor)
        {
            // Set the sensor to that passed in.
            Sensor = sensor;

            // Default the grammar to a known file (Grammar.xml) in the
            // same location as the application.
            GrammarFile = AppDomain.CurrentDomain.BaseDirectory + "Grammar.xml";

            // Set things going.
            StartRecognisingSpeech();
        }
        /// <summary>
        /// Initialise the speech recognition and set everything going.
        /// </summary>
        public void StartRecognisingSpeech()
        {
            // In a WinForms app this could instead grab a static instance
            // of the Kinect sensor; for now the sensor is passed in via
            // the constructor.
            // if (Form1.Sensor != null) { this.Sensor = Form1.Sensor; }

            RecognizerInfo recognizerInfo = GetKinectRecognizer();

            if (recognizerInfo != null)
            {
                SpeechEngine = new SpeechRecognitionEngine(recognizerInfo.Id);

                GrammarBuilder builder =
                    new GrammarBuilder { Culture = recognizerInfo.Culture };

                // The following code shows how to build a grammar
                // programmatically. However, it is just as easy to load a
                // grammar defined in XML by supplying the path to the file;
                // in reality that makes much more sense!

                // Choices choices = new Choices();
                // choices.Add(new SemanticResultValue("forward", "FORWARD"));
                // choices.Add(new SemanticResultValue("forwards", "FORWARD"));
                // choices.Add(new SemanticResultValue("ahead", "FORWARD"));
                // choices.Add(new SemanticResultValue("backward", "BACKWARD"));
                // choices.Add(new SemanticResultValue("backwards", "BACKWARD"));
                // choices.Add(new SemanticResultValue("back", "BACKWARD"));
                // choices.Add(new SemanticResultValue("turn left", "LEFT"));
                // choices.Add(new SemanticResultValue("left", "LEFT"));
                // choices.Add(new SemanticResultValue("turn right", "RIGHT"));
                // choices.Add(new SemanticResultValue("right", "RIGHT"));
                // builder.Append(choices);
                // Grammar grammar = new Grammar(builder);
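
                // For reference, a minimal Grammar.xml mirroring the choices
                // above might look like this (a hedged sketch in SRGS format;
                // the rule name and phrases are illustrative):
                //
                // <grammar version="1.0" xml:lang="en-US" root="Commands"
                //          tag-format="semantics/1.0-literals"
                //          xmlns="http://www.w3.org/2001/06/grammar">
                //   <rule id="Commands">
                //     <one-of>
                //       <item>forward <tag>FORWARD</tag></item>
                //       <item>backward <tag>BACKWARD</tag></item>
                //       <item>turn left <tag>LEFT</tag></item>
                //       <item>turn right <tag>RIGHT</tag></item>
                //     </one-of>
                //   </rule>
                // </grammar>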
                // Create a grammar from the grammar definition XML file
                // supplied (or defaulted) in the constructor.
                Grammar grammar = new Grammar(GrammarFile);
                SpeechEngine.LoadGrammar(grammar);

                // Register for events.
                SpeechEngine.SpeechRecognized += SpeechEngine_SpeechRecognized;
                SpeechEngine.SpeechRecognitionRejected += SpeechEngine_SpeechRejected;
                SpeechEngine.SpeechHypothesized += SpeechEngine_SpeechHypothesized;
                SpeechEngine.LoadGrammarCompleted += SpeechEngine_LoadGrammarCompleted;
                // For long recognition sessions (a few hours or more),
                // it may be beneficial to turn off adaptation of the
                // acoustic model. This will prevent recognition accuracy
                // from degrading over time.
                // SpeechEngine.UpdateRecognizerSetting("AdaptationOn", 0);
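
                // The Kinect audio source delivers 16 kHz, 16-bit mono PCM,
                // which is what the format below describes (32000 average
                // bytes per second, block align 2).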
                SpeechEngine.SetInputToAudioStream(Sensor.AudioSource.Start(),
                    new SpeechAudioFormatInfo(EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null));

                // RecognizeMode.Multiple specifies that recognition does
                // not terminate after a single completion.
                SpeechEngine.RecognizeAsync(RecognizeMode.Multiple);
            }
            else
            {
                // Failed to find a recogniser, so provide some detail here!
                System.Diagnostics.Debug.WriteLine("No Speech Recogniser Found!");
            }
        }
        /// <summary>
        /// Function used to stop the speech recogniser and clean
        /// up afterwards.
        /// </summary>
        public void StopRecognisingSpeech()
        {
            // Remove the events.
            SpeechEngine.SpeechRecognized -= SpeechEngine_SpeechRecognized;
            SpeechEngine.SpeechRecognitionRejected -= SpeechEngine_SpeechRejected;
            SpeechEngine.SpeechHypothesized -= SpeechEngine_SpeechHypothesized;
            SpeechEngine.LoadGrammarCompleted -= SpeechEngine_LoadGrammarCompleted;

            // Stop the recognition.
            SpeechEngine.RecognizeAsyncStop();
            // SpeechEngine.Dispose();
        }
        /// <summary>
        /// Gets the metadata for the speech recognizer (acoustic model)
        /// most suitable to process audio from the Kinect device.
        /// </summary>
        /// <returns>
        /// RecognizerInfo if found, <code>null</code> otherwise.
        /// </returns>
        private static RecognizerInfo GetKinectRecognizer()
        {
            foreach (RecognizerInfo recognizer in SpeechRecognitionEngine.InstalledRecognizers())
            {
                string value;
                recognizer.AdditionalInfo.TryGetValue("Kinect", out value);

                if ("True".Equals(value, StringComparison.OrdinalIgnoreCase) &&
                    "en-US".Equals(recognizer.Culture.Name, StringComparison.OrdinalIgnoreCase))
                {
                    return recognizer;
                }
            }

            return null;
        }
        /// <summary>
        /// Handler for the grammar loaded event.
        /// </summary>
        /// <param name="sender">object sending the event.</param>
        /// <param name="e">event arguments.</param>
        private void SpeechEngine_LoadGrammarCompleted(object sender, LoadGrammarCompletedEventArgs e)
        {
            System.Diagnostics.Debug.WriteLine("Grammar loaded: " + e.Grammar.Name);
        }
        /// <summary>
        /// Handler for recognized speech events.
        /// </summary>
        /// <param name="sender">object sending the event.</param>
        /// <param name="e">event arguments.</param>
        private void SpeechEngine_SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
        {
            // Speech utterance confidence below which we treat speech
            // as if it hadn't been heard.
            const double ConfidenceThreshold = 0.3;

            if (e.Result.Confidence >= ConfidenceThreshold)
            {
                // Handle the recognised phrase here, e.g.
                // System.Diagnostics.Debug.WriteLine(e.Result.Text);
            }
        }
        /// <summary>
        /// Handler for rejected speech events.
        /// </summary>
        /// <param name="sender">object sending the event.</param>
        /// <param name="e">event arguments.</param>
        private void SpeechEngine_SpeechRejected(object sender, SpeechRecognitionRejectedEventArgs e)
        {
            // System.Diagnostics.Debug.WriteLine("?");
        }

        /// <summary>
        /// Handler for hypothesized speech events.
        /// </summary>
        /// <param name="sender">object sending the event.</param>
        /// <param name="e">event arguments.</param>
        private void SpeechEngine_SpeechHypothesized(object sender, SpeechHypothesizedEventArgs e)
        {
            // System.Diagnostics.Debug.WriteLine(e.Result.Text);
        }
    }
}
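
For reference, a minimal sketch of the class in use within a WinForms host, assuming the Kinect 1.8 SDK and a single connected sensor. The Form1 class, the _sensor and _speech fields and the Load/FormClosing handlers are illustrative names, not part of this gist:

using System;
using System.Windows.Forms;
using Microsoft.Kinect;

namespace Kinect_Speech_WF
{
    public partial class Form1 : Form
    {
        private KinectSensor _sensor;
        private SpeechManager _speech;

        private void Form1_Load(object sender, EventArgs e)
        {
            // Find the first connected sensor and start it before
            // handing it to the SpeechManager.
            foreach (KinectSensor potential in KinectSensor.KinectSensors)
            {
                if (potential.Status == KinectStatus.Connected)
                {
                    _sensor = potential;
                    break;
                }
            }

            if (_sensor == null) return;
            _sensor.Start();

            // Uses the default Grammar.xml next to the executable; pass
            // a second argument to load a grammar file from elsewhere.
            _speech = new SpeechManager(_sensor);
        }

        private void Form1_FormClosing(object sender, FormClosingEventArgs e)
        {
            // Shut the recogniser down before stopping the sensor.
            if (_speech != null) _speech.StopRecognisingSpeech();
            if (_sensor != null) _sensor.Stop();
        }
    }
}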