Last active
June 17, 2020 06:52
-
-
Save arjan/1dfb9d601cd4423bacb7fb77e549a764 to your computer and use it in GitHub Desktop.
EAGI speech to text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// (c) Arjan Scherpenisse 2020, License: MIT | |
package main | |
import ( | |
"context" | |
"fmt" | |
"io" | |
"io/ioutil" | |
"time" | |
"github.com/CyCoreSystems/agi" | |
speech "cloud.google.com/go/speech/apiv1" | |
texttospeech "cloud.google.com/go/texttospeech/apiv1" | |
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1" | |
texttospeechpb "google.golang.org/genproto/googleapis/cloud/texttospeech/v1" | |
) | |
// Flush the EAGI buffer before recognize starts | |
func flushEAGI(a *agi.AGI) error { | |
buf := make([]byte, 1024) | |
for { | |
n, err := a.EAGI().Read(buf) | |
if n < 1024 { | |
break | |
} | |
if err != nil { | |
return err | |
} | |
} | |
return nil | |
} | |
func streamingRecognize(a *agi.AGI, timeout time.Duration) (*speechpb.StreamingRecognitionResult, bool) { | |
ctx, cancel := context.WithCancel(context.Background()) | |
defer cancel() | |
client, err := speech.NewClient(ctx) | |
if err != nil { | |
a.Verbose("Creating client failed", 1) | |
panic(err) | |
} | |
stream, err := client.StreamingRecognize(ctx) | |
if err != nil { | |
a.Verbose("Creating StreamingRecognize failed", 1) | |
panic(err) | |
} | |
// Send the initial configuration message. | |
if err := stream.Send(&speechpb.StreamingRecognizeRequest{ | |
StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{ | |
StreamingConfig: &speechpb.StreamingRecognitionConfig{ | |
SingleUtterance: true, | |
InterimResults: true, | |
Config: &speechpb.RecognitionConfig{ | |
Encoding: speechpb.RecognitionConfig_LINEAR16, | |
SampleRateHertz: 8000, | |
ProfanityFilter: false, | |
LanguageCode: "nl-NL", | |
// EnableAutomaticPunctuation: true, | |
// Model: "phone_call", | |
// UseEnhanced: true, | |
}, | |
}, | |
}, | |
}); err != nil { | |
a.Verbose("Creating speech request failed", 1) | |
panic(err) | |
} | |
go func() { | |
flushEAGI(a) | |
buf := make([]byte, 1024) | |
for { | |
select { | |
case <-stream.Context().Done(): | |
a.Verbose("Exit read loop", 1) | |
return | |
default: | |
n, err := a.EAGI().Read(buf) | |
if n > 0 { | |
if err := stream.Send(&speechpb.StreamingRecognizeRequest{ | |
StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{ | |
AudioContent: buf[:n], | |
}, | |
}); err != nil { | |
a.Verbose("Could not send audio", 1) | |
return | |
} | |
} | |
if err == io.EOF { | |
// Nothing else to pipe, close the stream. | |
if err := stream.CloseSend(); err != nil { | |
a.Verbose(fmt.Sprint("IO Error: %V", err), 1) | |
break | |
} | |
return | |
} | |
if err != nil { | |
a.Verbose(fmt.Sprint("IO Error: %V", err), 1) | |
break | |
} | |
} | |
} | |
}() | |
var resultChan = make(chan *speechpb.StreamingRecognitionResult) | |
var intermediateChan = make(chan *speechpb.StreamingRecognitionResult) | |
go func() { | |
for { | |
select { | |
case <-stream.Context().Done(): | |
a.Verbose("Exit result loop", 1) | |
break | |
default: | |
resp, err := stream.Recv() | |
intermediateChan <- nil | |
if err == io.EOF { | |
resultChan <- nil | |
return | |
} | |
if err != nil { | |
a.Verbose(fmt.Sprint("IO Error: %V", err), 1) | |
a.Verbose("Cannot stream results", 1) | |
} | |
if err := resp.Error; err != nil { | |
a.Verbose("Could not recognize", 1) | |
resultChan <- nil | |
return | |
} | |
for _, result := range resp.Results { | |
a.Verbose(result.String(), 1) | |
if result.IsFinal { | |
resultChan <- result | |
return | |
} else { | |
intermediateChan <- result | |
} | |
} | |
} | |
} | |
}() | |
timer := time.NewTimer(timeout) | |
var intermediateResult *speechpb.StreamingRecognitionResult | |
for { | |
select { | |
case result := <-resultChan: | |
return result, result != nil | |
case result := <-intermediateChan: | |
if result != nil { | |
intermediateResult = result | |
} | |
if intermediateResult != nil { | |
timer = time.NewTimer(1 * time.Second) | |
} | |
case <-timer.C: | |
a.Verbose("--TIMEOUT--", 1) | |
if intermediateResult != nil { | |
return intermediateResult, true | |
} | |
return nil, false | |
} | |
} | |
} | |
func textToSpeech(text, outputFile string) error { | |
ctx := context.Background() | |
client, err := texttospeech.NewClient(ctx) | |
if err != nil { | |
return err | |
} | |
req := texttospeechpb.SynthesizeSpeechRequest{ | |
Input: &texttospeechpb.SynthesisInput{ | |
InputSource: &texttospeechpb.SynthesisInput_Text{Text: text}, | |
}, | |
// Note: the voice can also be specified by name. | |
// Names of voices can be retrieved with client.ListVoices(). | |
Voice: &texttospeechpb.VoiceSelectionParams{ | |
LanguageCode: "nl-NL", | |
SsmlGender: texttospeechpb.SsmlVoiceGender_FEMALE, | |
}, | |
AudioConfig: &texttospeechpb.AudioConfig{ | |
AudioEncoding: texttospeechpb.AudioEncoding_LINEAR16, | |
SampleRateHertz: 8000, | |
}, | |
} | |
resp, err := client.SynthesizeSpeech(ctx, &req) | |
if err != nil { | |
return err | |
} | |
err = ioutil.WriteFile(outputFile, resp.AudioContent, 0644) | |
if err != nil { | |
return err | |
} | |
return nil | |
} | |
// main is the EAGI entry point: answer the call, then loop forever —
// play a beep, recognize one utterance, synthesize the transcript back
// to speech, and play it to the caller.
func main() {
	a := agi.NewEAGI()
	// NOTE(review): return values of Answer/StreamFile are ignored
	// throughout; confirm whether failures should abort the call.
	a.Answer()
	for {
		// Prompt the caller that we are listening.
		a.StreamFile("/sounds/beep", "", 0)
		// 5s overall silence timeout; ok is false when nothing usable
		// was recognized, in which case we just beep and listen again.
		if result, ok := streamingRecognize(a, 5*time.Second); ok {
			a.Verbose("----------------- YES", 1)
			a.Verbose(result.String(), 1)
			outfile := "/recordings/out"
			// Echo the recognized text back to the caller via TTS.
			// NOTE(review): Alternatives[0] will panic if the result has
			// no alternatives — TODO confirm the API always returns one.
			err := textToSpeech(result.Alternatives[0].Transcript, outfile+".wav")
			if err != nil {
				panic(err)
			}
			// The file is written as out.wav but played without the
			// extension — presumably the player adds it; verify.
			a.StreamFile(outfile, "", 0)
			a.Verbose("----------------- YES", 1)
		}
	}
	// NOTE(review): unreachable — the loop above never exits, so Hangup
	// is never called. Confirm whether an exit condition was intended.
	a.Hangup()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment