Last active
March 15, 2018 15:15
Speech recognition in Ruby using Google Cloud Speech and Easy Audio
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true

# All gems come from rubygems.org unless a gem names another source.
source "https://rubygems.org"

# Expand the `github:` shorthand into an HTTPS clone URL.
git_source(:github) do |repo|
  "https://github.com/#{repo}"
end

# easy_audio is fetched from its GitHub repository via the source above.
gem "easy_audio", github: "lsegal/easy_audio"
gem "google-cloud-speech"
gem "pry"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
GIT | |
remote: https://github.com/lsegal/easy_audio | |
revision: ae535c2efdae6fe0b5154e1ed371955ea0c09b58 | |
specs: | |
easy_audio (0.1.0) | |
ffi-portaudio (~> 0.0) | |
GEM | |
remote: https://rubygems.org/ | |
specs: | |
addressable (2.5.2) | |
public_suffix (>= 2.0.2, < 4.0) | |
coderay (1.1.2) | |
faraday (0.14.0) | |
multipart-post (>= 1.2, < 3) | |
ffi (1.9.23) | |
ffi-portaudio (0.1.2) | |
ffi | |
google-cloud-core (1.2.0) | |
google-cloud-env (~> 1.0) | |
google-cloud-env (1.0.1) | |
faraday (~> 0.11) | |
google-cloud-speech (0.29.0) | |
google-cloud-core (~> 1.2) | |
google-gax (~> 1.0) | |
google-gax (1.0.1) | |
google-protobuf (~> 3.2) | |
googleapis-common-protos (>= 1.3.5, < 2.0) | |
googleauth (~> 0.6.2) | |
grpc (>= 1.7.2, < 2.0) | |
rly (~> 0.2.3) | |
google-protobuf (3.5.1.2) | |
googleapis-common-protos (1.3.7) | |
google-protobuf (~> 3.0) | |
googleapis-common-protos-types (~> 1.0) | |
grpc (~> 1.0) | |
googleapis-common-protos-types (1.0.1) | |
google-protobuf (~> 3.0) | |
googleauth (0.6.2) | |
faraday (~> 0.12) | |
jwt (>= 1.4, < 3.0) | |
logging (~> 2.0) | |
memoist (~> 0.12) | |
multi_json (~> 1.11) | |
os (~> 0.9) | |
signet (~> 0.7) | |
grpc (1.10.0) | |
google-protobuf (~> 3.1) | |
googleapis-common-protos-types (~> 1.0.0) | |
googleauth (>= 0.5.1, < 0.7) | |
jwt (2.1.0) | |
little-plugger (1.1.4) | |
logging (2.2.2) | |
little-plugger (~> 1.1) | |
multi_json (~> 1.10) | |
memoist (0.16.0) | |
method_source (0.9.0) | |
multi_json (1.13.1) | |
multipart-post (2.0.0) | |
os (0.9.6) | |
pry (0.11.3) | |
coderay (~> 1.1.0) | |
method_source (~> 0.9.0) | |
public_suffix (3.0.2) | |
rly (0.2.3) | |
signet (0.8.1) | |
addressable (~> 2.3) | |
faraday (~> 0.9) | |
jwt (>= 1.5, < 3.0) | |
multi_json (~> 1.10) | |
PLATFORMS | |
ruby | |
DEPENDENCIES | |
easy_audio! | |
google-cloud-speech | |
pry | |
BUNDLED WITH | |
1.16.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "bundler/setup" | |
require "easy_audio" | |
require "google/cloud/speech" | |
require "pry" | |
require "stringio" | |
require "json" | |
# | |
# Audio Recorder | |
# | |
# Converts an audio sample to a signed 16-bit PCM integer.
#
# The sample is scaled by 32768 ((2 ** 16) / 2), saturated to the int16
# range and truncated toward zero, so 1.0 maps to 32767 and -1.0 to -32768.
#
# Non-finite input never raises: +/-Infinity saturates to the nearest bound
# and NaN is mapped to 0.
#
# @param f [Numeric] the sample to convert
# @return [Integer] a value in -32768..32767
def convert(f)
  scaled = f * 32_768 # (2 ** 16) / 2

  # NaN cannot be ordered, so #clamp would raise on it; emit a zero sample.
  return 0 if scaled.to_f.nan?

  # Clamp before #to_i: Float#to_i raises FloatDomainError on +/-Infinity.
  scaled.clamp(-32_768, 32_767).to_i
end
# Capture settings handed to EasyAudio::Stream below. SAMPLE_RATE is also
# passed to the speech recognizer later in this script.
SAMPLE_RATE = 44_100
CHANNELS = 1
FRAME_SIZE = 256

# Recording sink shared with the main loop: a binary String while a recording
# is in progress, nil while captured audio should be discarded.
output_buffer = nil

stream = EasyAudio::Stream.new({
  sample_rate: SAMPLE_RATE,
  in_chans: CHANNELS,
  frame_size: FRAME_SIZE
}) do |buffer|
  # Read the shared variable exactly once. The main loop reassigns it while
  # the stream is running, so checking it and then reading it again could
  # observe two different values.
  sink = output_buffer

  if sink
    int16_samples = buffer.samples.map { |sample| convert(sample) }
    # "s<" is a little-endian signed 16-bit integer; "*" packs every sample
    # in the packet instead of assuming exactly FRAME_SIZE of them. With the
    # buffer: keyword the packed bytes are appended to the sink.
    int16_samples.pack("s<*", buffer: sink)
  end

  :paContinue
end

stream.start
# | |
# Audio Analyzer | |
# | |
# Service-account credentials are read from the current working directory.
credentials = JSON.parse(File.read("./credentials.json"))
speech = Google::Cloud::Speech.new(
  project_id: credentials.fetch("project_id"),
  credentials: credentials
)

# Record for five seconds, send the captured audio for recognition, print
# the most confident transcript, and repeat until the process is stopped.
loop do
  puts "Starting! Speak freely for five seconds!"
  # Handing the stream callback a fresh binary String starts a recording.
  output_buffer = "".b
  sleep 5
  puts "Analyzing the last 5 seconds…"

  # Keep a reference to the recording, then set the shared variable back to
  # nil so the callback stops appending while the request is in flight.
  recording = output_buffer
  output_buffer = nil

  audio = speech.audio(
    StringIO.new(recording),
    encoding: :linear16,
    language: "en-US",
    sample_rate: SAMPLE_RATE
  )

  # max_by ranks results by their confidence. The previous max(&:confidence)
  # used the symbol as a two-argument comparator, i.e. a.confidence(b), which
  # raises ArgumentError as soon as two or more results come back.
  result = audio.recognize.max_by(&:confidence)

  if result
    puts "You said: #{result.transcript}"
  else
    puts "No idea what you said!"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment