Skip to content

Instantly share code, notes, and snippets.

@KzoNag
Created June 23, 2015 07:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KzoNag/bfe8c0f07335ce2268cc to your computer and use it in GitHub Desktop.
using UnityEngine;
using System.Collections;
using System.Collections.Generic;
using System;
using System.IO;
using System.Text;
/// <summary>
/// Records-free helper that resamples an AudioClip to 16 kHz mono, wraps it in a
/// WAVE container, and posts it to the Google Speech API v2 for recognition.
/// Poll <see cref="State"/> / read <see cref="Result"/> once it reaches Done.
/// </summary>
public class VoiceRecognizer : MonoBehaviour
{
    [SerializeField]
    private bool executeOnAwake = false;

    [SerializeField]
    private string googleApiKey;

    /// <summary>Google Speech API key used for recognition requests.</summary>
    public string GoogleApiKey { get { return googleApiKey; } }

    [SerializeField]
    private AudioClip targetClip;

    /// <summary>The clip submitted for recognition.</summary>
    public AudioClip TargetClip { get { return targetClip; } }

    /// <summary>The clip after conversion to the channel count / sample rate the API needs.</summary>
    public AudioClip ConvertedClip { get; private set; }

    /// <summary>WAVE-format bytes built from <see cref="ConvertedClip"/>.</summary>
    public byte[] WaveData { get; private set; }

    /// <summary>Response body text, or null until the request has finished.</summary>
    public string Result { get { return (state == RecognizeState.Done) ? www.text : null; } }

    /// <summary>Request error string, or null until the request has finished.</summary>
    public string Error { get { return (state == RecognizeState.Done) ? www.error : null; } }

    /// <summary>Lifecycle of a recognition request.</summary>
    public enum RecognizeState
    {
        Stay,
        Doing,
        Done
    }

    private RecognizeState state = RecognizeState.Stay;

    /// <summary>Current request state.</summary>
    public RecognizeState State { get { return state; } }

    // The Speech API endpoint used below expects 16 kHz mono linear PCM.
    private readonly int needFrequency = 16000;
    private readonly int needChannels = 1;

    private WWW www;

    void Awake()
    {
        if (executeOnAwake)
        {
            ExecuteRecognize();
        }
    }

    void Update()
    {
        // Poll the in-flight request each frame; the legacy WWW API has no callback.
        if (state == RecognizeState.Doing && www.isDone)
        {
            state = RecognizeState.Done;
            //LogResult();
        }
    }

    /// <summary>
    /// Clears a finished (or never-started) request so a new one can be issued.
    /// </summary>
    /// <returns>false while a request is still in flight; true otherwise.</returns>
    public bool Reset()
    {
        if (state == RecognizeState.Doing)
        {
            return false;
        }
        state = RecognizeState.Stay;
        www = null;
        ConvertedClip = null;
        WaveData = null;
        return true;
    }

    /// <summary>
    /// Issues a recognition request for the given clip with the given API key.
    /// </summary>
    /// <returns>true when the request was started; false otherwise.</returns>
    /// <param name="_targetClip">AudioClip containing the speech to recognize.</param>
    /// <param name="_googleApiKey">Google API key.</param>
    public bool ExecuteRecognize(AudioClip _targetClip, string _googleApiKey)
    {
        targetClip = _targetClip;
        googleApiKey = _googleApiKey;
        return ExecuteRecognize();
    }

    /// <summary>
    /// Issues a recognition request for <see cref="TargetClip"/>.
    /// </summary>
    /// <returns>true when the request was started; false when one is already
    /// running or the clip / API key is missing.</returns>
    public bool ExecuteRecognize()
    {
        if (state != RecognizeState.Stay || targetClip == null || string.IsNullOrEmpty(googleApiKey))
        {
            return false;
        }
        // Convert the clip to the format the API can recognize.
        ConvertedClip = ConvertSamplingAudioClip(targetClip, needChannels, needFrequency);
        // Build WAVE-format bytes from the converted clip.
        WaveData = CreateWaveData(ConvertedClip);
        // Fire the recognition request with the WAVE data.
        RequestSpeechAPI(WaveData, needFrequency);
        state = RecognizeState.Doing;
        return true;
    }

    /// <summary>
    /// Resamples an AudioClip to the requested channel count and sample rate
    /// using nearest-neighbour sampling. Returns the source unchanged when it
    /// already matches.
    /// </summary>
    AudioClip ConvertSamplingAudioClip(AudioClip src, int channels, int frequency)
    {
        // Already in the requested format — nothing to do.
        if (src.frequency == frequency && src.channels == channels)
        {
            return src;
        }

        float[] srcData = new float[src.channels * src.samples];
        src.GetData(srcData, 0);

        int dataLength = Mathf.CeilToInt(frequency * channels * src.length);
        float[] destData = new float[dataLength];
        // Ratio of source samples consumed per destination sample.
        double rate = (double)src.frequency / (double)frequency;

        for (int destIndex = 0; destIndex < destData.Length; ++destIndex)
        {
            // Which output sample frame and channel this element belongs to.
            int destSampleIndex = destIndex / channels;
            int destChannelIndex = destIndex % channels;
            // Nearest source sample; clamp so CeilToInt rounding at the tail
            // cannot index past the last source frame (the original code could
            // read out of range here).
            int srcSampleIndex = Mathf.Min((int)(destSampleIndex * rate), src.samples - 1);
            // When downmixing channels, reuse the last source channel.
            int srcChannelIndex = (destChannelIndex < src.channels) ? destChannelIndex : src.channels - 1;
            destData[destIndex] = srcData[srcSampleIndex * src.channels + srcChannelIndex];
        }

        AudioClip dest = AudioClip.Create("converted", dataLength, channels, frequency, false, false);
        dest.SetData(destData, 0);
        return dest;
    }

    /// <summary>
    /// Builds a canonical 16-bit PCM WAVE byte stream (RIFF / "fmt " / "data")
    /// from the given clip. BinaryWriter always emits little-endian, which is
    /// what the WAVE format requires regardless of host endianness.
    /// </summary>
    byte[] CreateWaveData(AudioClip clip)
    {
        const short bit = 16;                               // bits per sample
        short channels = (short)clip.channels;
        int frequency = clip.frequency;
        int byteRate = frequency * (bit / 8) * channels;    // bytes per second
        short blockSize = (short)((bit / 8) * channels);    // bytes per sample frame
        int dataSize = clip.channels * clip.samples * (bit / 8);

        using (var stream = new MemoryStream())
        using (var writer = new BinaryWriter(stream))
        {
            // RIFF chunk descriptor.
            writer.Write(Encoding.ASCII.GetBytes("RIFF"));
            writer.Write(dataSize + 36);                    // total size minus the first 8 bytes
            writer.Write(Encoding.ASCII.GetBytes("WAVE"));
            // "fmt " sub-chunk (linear PCM).
            writer.Write(Encoding.ASCII.GetBytes("fmt "));
            writer.Write(16);                               // fmt chunk byte count
            writer.Write((short)1);                         // format ID: PCM
            writer.Write(channels);
            writer.Write(frequency);
            writer.Write(byteRate);
            writer.Write(blockSize);
            writer.Write(bit);
            // "data" sub-chunk.
            writer.Write(Encoding.ASCII.GetBytes("data"));
            writer.Write(dataSize);

            float[] rawData = new float[clip.channels * clip.samples];
            clip.GetData(rawData, 0);
            foreach (var f in rawData)
            {
                writer.Write(GetBytes(f, bit));
            }
            writer.Flush();
            return stream.ToArray();
        }
    }

    /// <summary>
    /// Converts one normalized float sample to PCM bytes: unsigned for 8-bit,
    /// signed little-endian for 16-bit. Unknown bit sizes yield an empty array.
    /// </summary>
    byte[] GetBytes(float f, int bitSize)
    {
        // Unity sample data is nominally [-1, 1] but can exceed it; clamp to
        // avoid integer overflow during the scale below.
        float clamped = Mathf.Clamp(f, -1f, 1f);

        // 8-bit WAVE is unsigned, centred on 128. (The original
        // `(byte)(f * byte.MaxValue) + 128` wrapped for any f above ~0.5 and
        // mangled negative samples.)
        if (bitSize == 8)
        {
            byte val = (byte)Mathf.RoundToInt(clamped * 127f + 128f);
            return new byte[] { val };
        }
        // 16-bit WAVE is signed.
        if (bitSize == 16)
        {
            short val = (short)(clamped * short.MaxValue);
            return BitConverter.GetBytes(val);
        }
        return new byte[0];
    }

    /// <summary>
    /// Saves WAVE-format bytes to a file (debug helper).
    /// </summary>
    /// <param name="waveData">Wave data.</param>
    /// <param name="path">Destination path.</param>
    void SaveWaveFile(byte[] waveData, string path)
    {
        // `using` guarantees the handle is released even if Write throws;
        // the original leaked the stream on exception.
        using (var fileStream = new FileStream(path, FileMode.Create, FileAccess.Write))
        {
            fileStream.Write(waveData, 0, waveData.Length);
        }
    }

    /// <summary>
    /// Posts the audio bytes to the Google Speech API v2 endpoint. WWW with a
    /// postData payload issues a POST; completion is polled in Update().
    /// </summary>
    void RequestSpeechAPI(byte[] waveData, int frequency)
    {
        var url = "https://www.google.com/speech-api/v2/recognize?output=json&lang=ja&key=" + googleApiKey;
        var headers = new Dictionary<string, string>();
        // NOTE(review): "Method" is not a real HTTP header (WWW already POSTs
        // when postData is supplied) and WWW normally sets Content-Length
        // itself — kept for compatibility with the original request shape.
        headers.Add("Method", "POST");
        headers.Add("Content-Type", "audio/l16; rate=" + frequency.ToString());
        headers.Add("Content-Length", waveData.Length.ToString());
        headers.Add("Accept", "application/json");
        www = new WWW(url, waveData, headers);
    }

    /// <summary>
    /// Dumps the current request outcome via the project Logger (debug helper).
    /// </summary>
    void LogResult()
    {
        if (www == null)
        {
            Logger.LogError("www == null");
        }
        else if (!www.isDone)
        {
            Logger.LogError("www is doing");
        }
        else if (!string.IsNullOrEmpty(www.error))
        {
            Logger.LogError("[Error]" + www.error);
        }
        else
        {
            Logger.Log("[Success]" + www.text);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment