Last active
June 17, 2020 06:52
-
-
Save arjan/1dfb9d601cd4423bacb7fb77e549a764 to your computer and use it in GitHub Desktop.
EAGI speech to text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// (c) Arjan Scherpenisse 2020, License: MIT | |
package main | |
import ( | |
"context" | |
"fmt" | |
"io" | |
"io/ioutil" | |
"time" | |
"github.com/CyCoreSystems/agi" | |
speech "cloud.google.com/go/speech/apiv1" | |
texttospeech "cloud.google.com/go/texttospeech/apiv1" | |
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1" | |
texttospeechpb "google.golang.org/genproto/googleapis/cloud/texttospeech/v1" | |
) | |
// Flush the EAGI buffer before recognize starts | |
func flushEAGI(a *agi.AGI) error { | |
buf := make([]byte, 1024) | |
for { | |
n, err := a.EAGI().Read(buf) | |
if n < 1024 { | |
break | |
} | |
if err != nil { | |
return err | |
} | |
} | |
return nil | |
} | |
func streamingRecognize(a *agi.AGI, timeout time.Duration) (*speechpb.StreamingRecognitionResult, bool) { | |
ctx, cancel := context.WithCancel(context.Background()) | |
defer cancel() | |
client, err := speech.NewClient(ctx) | |
if err != nil { | |
a.Verbose("Creating client failed", 1) | |
panic(err) | |
} | |
stream, err := client.StreamingRecognize(ctx) | |
if err != nil { | |
a.Verbose("Creating StreamingRecognize failed", 1) | |
panic(err) | |
} | |
// Send the initial configuration message. | |
if err := stream.Send(&speechpb.StreamingRecognizeRequest{ | |
StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{ | |
StreamingConfig: &speechpb.StreamingRecognitionConfig{ | |
SingleUtterance: true, | |
InterimResults: true, | |
Config: &speechpb.RecognitionConfig{ | |
Encoding: speechpb.RecognitionConfig_LINEAR16, | |
SampleRateHertz: 8000, | |
ProfanityFilter: false, | |
LanguageCode: "nl-NL", | |
// EnableAutomaticPunctuation: true, | |
// Model: "phone_call", | |
// UseEnhanced: true, | |
}, | |
}, | |
}, | |
}); err != nil { | |
a.Verbose("Creating speech request failed", 1) | |
panic(err) | |
} | |
go func() { | |
flushEAGI(a) | |
buf := make([]byte, 1024) | |
for { | |
select { | |
case <-stream.Context().Done(): | |
a.Verbose("Exit read loop", 1) | |
return | |
default: | |
n, err := a.EAGI().Read(buf) | |
if n > 0 { | |
if err := stream.Send(&speechpb.StreamingRecognizeRequest{ | |
StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{ | |
AudioContent: buf[:n], | |
}, | |
}); err != nil { | |
a.Verbose("Could not send audio", 1) | |
return | |
} | |
} | |
if err == io.EOF { | |
// Nothing else to pipe, close the stream. | |
if err := stream.CloseSend(); err != nil { | |
a.Verbose(fmt.Sprint("IO Error: %V", err), 1) | |
break | |
} | |
return | |
} | |
if err != nil { | |
a.Verbose(fmt.Sprint("IO Error: %V", err), 1) | |
break | |
} | |
} | |
} | |
}() | |
var resultChan = make(chan *speechpb.StreamingRecognitionResult) | |
var intermediateChan = make(chan *speechpb.StreamingRecognitionResult) | |
go func() { | |
for { | |
select { | |
case <-stream.Context().Done(): | |
a.Verbose("Exit result loop", 1) | |
break | |
default: | |
resp, err := stream.Recv() | |
intermediateChan <- nil | |
if err == io.EOF { | |
resultChan <- nil | |
return | |
} | |
if err != nil { | |
a.Verbose(fmt.Sprint("IO Error: %V", err), 1) | |
a.Verbose("Cannot stream results", 1) | |
} | |
if err := resp.Error; err != nil { | |
a.Verbose("Could not recognize", 1) | |
resultChan <- nil | |
return | |
} | |
for _, result := range resp.Results { | |
a.Verbose(result.String(), 1) | |
if result.IsFinal { | |
resultChan <- result | |
return | |
} else { | |
intermediateChan <- result | |
} | |
} | |
} | |
} | |
}() | |
timer := time.NewTimer(timeout) | |
var intermediateResult *speechpb.StreamingRecognitionResult | |
for { | |
select { | |
case result := <-resultChan: | |
return result, result != nil | |
case result := <-intermediateChan: | |
if result != nil { | |
intermediateResult = result | |
} | |
if intermediateResult != nil { | |
timer = time.NewTimer(1 * time.Second) | |
} | |
case <-timer.C: | |
a.Verbose("--TIMEOUT--", 1) | |
if intermediateResult != nil { | |
return intermediateResult, true | |
} | |
return nil, false | |
} | |
} | |
} | |
func textToSpeech(text, outputFile string) error { | |
ctx := context.Background() | |
client, err := texttospeech.NewClient(ctx) | |
if err != nil { | |
return err | |
} | |
req := texttospeechpb.SynthesizeSpeechRequest{ | |
Input: &texttospeechpb.SynthesisInput{ | |
InputSource: &texttospeechpb.SynthesisInput_Text{Text: text}, | |
}, | |
// Note: the voice can also be specified by name. | |
// Names of voices can be retrieved with client.ListVoices(). | |
Voice: &texttospeechpb.VoiceSelectionParams{ | |
LanguageCode: "nl-NL", | |
SsmlGender: texttospeechpb.SsmlVoiceGender_FEMALE, | |
}, | |
AudioConfig: &texttospeechpb.AudioConfig{ | |
AudioEncoding: texttospeechpb.AudioEncoding_LINEAR16, | |
SampleRateHertz: 8000, | |
}, | |
} | |
resp, err := client.SynthesizeSpeech(ctx, &req) | |
if err != nil { | |
return err | |
} | |
err = ioutil.WriteFile(outputFile, resp.AudioContent, 0644) | |
if err != nil { | |
return err | |
} | |
return nil | |
} | |
// main is the EAGI entry point: answer the call, then loop forever —
// play a beep, recognize one utterance, synthesize the transcript back
// to speech, and play it to the caller.
func main() {
	a := agi.NewEAGI()
	// NOTE(review): return values of Answer/StreamFile are ignored
	// throughout; confirm whether failures should abort the call.
	a.Answer()
	for {
		// Prompt the caller that we are listening.
		a.StreamFile("/sounds/beep", "", 0)
		// 5s overall silence timeout; ok is false when nothing usable
		// was recognized, in which case we just beep and listen again.
		if result, ok := streamingRecognize(a, 5*time.Second); ok {
			a.Verbose("----------------- YES", 1)
			a.Verbose(result.String(), 1)
			outfile := "/recordings/out"
			// Echo the recognized text back to the caller via TTS.
			// NOTE(review): Alternatives[0] will panic if the result has
			// no alternatives — TODO confirm the API always returns one.
			err := textToSpeech(result.Alternatives[0].Transcript, outfile+".wav")
			if err != nil {
				panic(err)
			}
			// The file is written as out.wav but played without the
			// extension — presumably the player adds it; verify.
			a.StreamFile(outfile, "", 0)
			a.Verbose("----------------- YES", 1)
		}
	}
	// NOTE(review): unreachable — the loop above never exits, so Hangup
	// is never called. Confirm whether an exit condition was intended.
	a.Hangup()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment