EAGI speech to text
// (c) Arjan Scherpenisse 2020, License: MIT
package main

import (
	"context"
	"fmt"
	"io"
	"io/ioutil"
	"time"

	"github.com/CyCoreSystems/agi"

	speech "cloud.google.com/go/speech/apiv1"
	texttospeech "cloud.google.com/go/texttospeech/apiv1"
	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
	texttospeechpb "google.golang.org/genproto/googleapis/cloud/texttospeech/v1"
)

// Flush the EAGI buffer before recognize starts
func flushEAGI(a *agi.AGI) error {
	buf := make([]byte, 1024)
	for {
		n, err := a.EAGI().Read(buf)
		if n < 1024 {
			break
		}
		if err != nil {
			return err
		}
	}
	return nil
}

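// streamingRecognize streams the caller's audio to Google Cloud Speech-to-Text
// and blocks until a final result arrives, interim results go quiet for about
// a second, or the timeout expires. The boolean reports whether a usable
// result was obtained. The Speech client authenticates via Application Default
// Credentials (for example GOOGLE_APPLICATION_CREDENTIALS).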
func streamingRecognize(a *agi.AGI, timeout time.Duration) (*speechpb.StreamingRecognitionResult, bool) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	client, err := speech.NewClient(ctx)
	if err != nil {
		a.Verbose("Creating client failed", 1)
		panic(err)
	}
	stream, err := client.StreamingRecognize(ctx)
	if err != nil {
		a.Verbose("Creating StreamingRecognize failed", 1)
		panic(err)
	}

	// Send the initial configuration message.
	if err := stream.Send(&speechpb.StreamingRecognizeRequest{
		StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
			StreamingConfig: &speechpb.StreamingRecognitionConfig{
				SingleUtterance: true,
				InterimResults:  true,
				Config: &speechpb.RecognitionConfig{
					Encoding:        speechpb.RecognitionConfig_LINEAR16,
					SampleRateHertz: 8000,
					ProfanityFilter: false,
					LanguageCode:    "nl-NL",
					// EnableAutomaticPunctuation: true,
					// Model: "phone_call",
					// UseEnhanced: true,
				},
			},
		},
	}); err != nil {
		a.Verbose("Creating speech request failed", 1)
		panic(err)
	}

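	// Forward raw audio (8 kHz signed linear, as delivered on the EAGI audio
	// descriptor) to the recognition stream until the stream is done or the
	// audio source ends.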
	go func() {
		flushEAGI(a)
		buf := make([]byte, 1024)
		for {
			select {
			case <-stream.Context().Done():
				a.Verbose("Exit read loop", 1)
				return
			default:
				n, err := a.EAGI().Read(buf)
				if n > 0 {
					if err := stream.Send(&speechpb.StreamingRecognizeRequest{
						StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
							AudioContent: buf[:n],
						},
					}); err != nil {
						a.Verbose("Could not send audio", 1)
						return
					}
				}
				if err == io.EOF {
					// Nothing else to pipe, close the stream.
					if err := stream.CloseSend(); err != nil {
						a.Verbose(fmt.Sprintf("IO Error: %v", err), 1)
					}
					return
				}
				if err != nil {
					a.Verbose(fmt.Sprintf("IO Error: %v", err), 1)
					return
				}
			}
		}
	}()

	var resultChan = make(chan *speechpb.StreamingRecognitionResult)
	var intermediateChan = make(chan *speechpb.StreamingRecognitionResult)

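	// Collect recognition responses: interim hypotheses go to intermediateChan
	// (plus a nil keep-alive per response); the first final result, or nil on a
	// terminal error, goes to resultChan.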
	go func() {
		for {
			select {
			case <-stream.Context().Done():
				a.Verbose("Exit result loop", 1)
				return
			default:
				resp, err := stream.Recv()
				intermediateChan <- nil
				if err == io.EOF {
					resultChan <- nil
					return
				}
				if err != nil {
					a.Verbose(fmt.Sprintf("IO Error: %v", err), 1)
					a.Verbose("Cannot stream results", 1)
					resultChan <- nil
					return
				}
				if err := resp.Error; err != nil {
					a.Verbose("Could not recognize", 1)
					resultChan <- nil
					return
				}
				for _, result := range resp.Results {
					a.Verbose(result.String(), 1)
					if result.IsFinal {
						resultChan <- result
						return
					}
					intermediateChan <- result
				}
			}
		}
	}()

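	// Wait for a result. Once a hypothesis has been seen, every message on
	// intermediateChan shrinks the remaining window to one second, so the wait
	// ends shortly after the recognizer stops producing new results.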
	timer := time.NewTimer(timeout)
	var intermediateResult *speechpb.StreamingRecognitionResult
	for {
		select {
		case result := <-resultChan:
			return result, result != nil
		case result := <-intermediateChan:
			if result != nil {
				intermediateResult = result
			}
			if intermediateResult != nil {
				timer = time.NewTimer(1 * time.Second)
			}
		case <-timer.C:
			a.Verbose("--TIMEOUT--", 1)
			if intermediateResult != nil {
				return intermediateResult, true
			}
			return nil, false
		}
	}
}

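// textToSpeech synthesizes text with Google Cloud Text-to-Speech (Dutch female
// voice) and writes the result as 8 kHz LINEAR16 audio to outputFile.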
func textToSpeech(text, outputFile string) error {
	ctx := context.Background()
	client, err := texttospeech.NewClient(ctx)
	if err != nil {
		return err
	}

	req := texttospeechpb.SynthesizeSpeechRequest{
		Input: &texttospeechpb.SynthesisInput{
			InputSource: &texttospeechpb.SynthesisInput_Text{Text: text},
		},
		// Note: the voice can also be specified by name.
		// Names of voices can be retrieved with client.ListVoices().
		Voice: &texttospeechpb.VoiceSelectionParams{
			LanguageCode: "nl-NL",
			SsmlGender:   texttospeechpb.SsmlVoiceGender_FEMALE,
		},
		AudioConfig: &texttospeechpb.AudioConfig{
			AudioEncoding:   texttospeechpb.AudioEncoding_LINEAR16,
			SampleRateHertz: 8000,
		},
	}

	resp, err := client.SynthesizeSpeech(ctx, &req)
	if err != nil {
		return err
	}

	return ioutil.WriteFile(outputFile, resp.AudioContent, 0644)
}

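// main implements the call flow: answer, then repeatedly play a beep,
// recognize one utterance, synthesize the transcript, and play it back to the
// caller. Asterisk starts this binary through its EAGI() dialplan application,
// which exposes the call audio on an extra file descriptor (read via a.EAGI()).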
func main() {
	a := agi.NewEAGI()
	a.Answer()
	for {
		a.StreamFile("/sounds/beep", "", 0)
		result, ok := streamingRecognize(a, 5*time.Second)
		if !ok {
			// No result within the timeout (silence, or the caller hung up):
			// leave the loop and hang up.
			break
		}
		a.Verbose("----------------- YES", 1)
		a.Verbose(result.String(), 1)
		outfile := "/recordings/out"
		if err := textToSpeech(result.Alternatives[0].Transcript, outfile+".wav"); err != nil {
			panic(err)
		}
		a.StreamFile(outfile, "", 0)
		a.Verbose("----------------- YES", 1)
	}
	a.Hangup()
}