Last active
October 28, 2021 06:09
-
-
Save imjasonh/78c22b36944b8ec189456e67e63bfaa4 to your computer and use it in GitHub Desktop.
Text-to-Speech-to-Text-to-Speech ad infinitum
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"context" | |
"errors" | |
"flag" | |
"fmt" | |
"log" | |
"math/rand" | |
"os" | |
"time" | |
"golang.org/x/oauth2" | |
"google.golang.org/api/option" | |
"google.golang.org/api/transport" | |
speech "cloud.google.com/go/speech/apiv1" | |
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1" | |
ttspb "google.golang.org/genproto/googleapis/cloud/texttospeech/v1beta1" | |
) | |
// address is the host:port endpoint handed to option.WithEndpoint when main
// dials the Text-to-Speech service.
const address = "texttospeech.googleapis.com:443"
// Command-line flags; each variable points at the parsed value once
// flag.Parse has been called in main.
var (
	// n is the number of speak iterations performed by partyOn.
	n = flag.Int("n", 10, "Iterations")

	// say is the first phrase that gets synthesized.
	say = flag.String("say", "hello world", "Seed statement")

	// tok is the OAuth access token used for the Text-to-Speech connection.
	tok = flag.String("tok", "", "OAuth token")
)
func main() { | |
flag.Parse() | |
rand.Seed(time.Now().Unix()) | |
ctx := context.Background() | |
// Set up a connection to the TTS server. | |
conn, err := transport.DialGRPC(ctx, | |
option.WithEndpoint(address), | |
option.WithTokenSource(oauth2.StaticTokenSource(&oauth2.Token{AccessToken: *tok}))) | |
if err != nil { | |
log.Fatalf("Dial: %v", err) | |
} | |
defer conn.Close() | |
// Set up connection to Speech server. | |
sc, err := speech.NewClient(ctx) | |
if err != nil { | |
log.Fatalf("NewClient: %v", err) | |
} | |
defer sc.Close() | |
p := &partier{ | |
tts: ttspb.NewTextToSpeechClient(conn), | |
rec: sc, | |
say: *say, | |
n: *n, | |
} | |
if err := p.partyOn(ctx); err != nil { | |
log.Fatal(err) | |
} | |
} | |
type partier struct { | |
say string | |
lang string | |
n int | |
tts ttspb.TextToSpeechClient | |
rec *speech.Client | |
} | |
func (p *partier) partyOn(ctx context.Context) error { | |
p.lang = "en-US" | |
fmt.Printf("%s said: %s\n", icon["en-US"], p.say) | |
for i := 0; i < p.n; i++ { | |
b, err := p.speak(ctx) | |
if err != nil { | |
return fmt.Errorf("TTS: %v", err) | |
} | |
if err := p.write(i, b); err != nil { | |
return fmt.Errorf("Write: %v", err) | |
} | |
p.nextLang() | |
if i < p.n-1 { | |
out, err := p.listen(ctx, b) | |
if err != nil { | |
return fmt.Errorf("Recognize: %v", err) | |
} | |
p.say = out | |
} | |
} | |
return nil | |
} | |
// icon maps each supported language code to the flag emoji printed next to
// what was said or heard in that language. Its key set is also the pool of
// languages nextLang chooses from.
//
// Each flag is a pair of Unicode regional-indicator symbols spelling the
// region's two-letter code. They are written as escapes so the source stays
// correct even if the file is read with the wrong text encoding.
var icon = map[string]string{
	"fr-FR": "\U0001F1EB\U0001F1F7", // FR
	"de-DE": "\U0001F1E9\U0001F1EA", // DE
	"fr-CA": "\U0001F1E8\U0001F1E6", // CA
	"en-GB": "\U0001F1EC\U0001F1E7", // GB
	"en-US": "\U0001F1FA\U0001F1F8", // US
	"es-ES": "\U0001F1EA\U0001F1F8", // ES
}
func (p *partier) nextLang() string { | |
for k := range icon { | |
if k != p.lang { | |
p.lang = k | |
return k | |
} | |
} | |
panic("unreachable") | |
} | |
func (p *partier) speak(ctx context.Context) ([]byte, error) { | |
resp, err := p.tts.SynthesizeSpeech(ctx, &ttspb.SynthesizeSpeechRequest{ | |
Input: &ttspb.SynthesisInput{ | |
InputSource: &ttspb.SynthesisInput_Text{Text: p.say}, | |
}, | |
Voice: &ttspb.VoiceSelectionParams{ | |
LanguageCode: p.lang, | |
}, | |
AudioConfig: &ttspb.AudioConfig{ | |
AudioEncoding: ttspb.AudioEncoding_LINEAR16, | |
}, | |
}) | |
if err != nil { | |
return nil, err | |
} | |
return resp.GetAudioContent(), nil | |
} | |
func (p *partier) write(i int, b []byte) error { | |
f, err := os.Create(fmt.Sprintf("step-%d.wav", i)) | |
if err != nil { | |
return err | |
} | |
defer f.Close() | |
if _, err := f.Write(b); err != nil { | |
return err | |
} | |
return nil | |
} | |
func (p *partier) listen(ctx context.Context, audio []byte) (string, error) { | |
resp, err := p.rec.Recognize(ctx, &speechpb.RecognizeRequest{ | |
Config: &speechpb.RecognitionConfig{ | |
Encoding: speechpb.RecognitionConfig_LINEAR16, | |
SampleRateHertz: 24000, | |
LanguageCode: p.lang, | |
}, | |
Audio: &speechpb.RecognitionAudio{AudioSource: &speechpb.RecognitionAudio_Content{Content: audio}}, | |
}) | |
if err != nil { | |
return "", err | |
} | |
if len(resp.GetResults()) == 0 || | |
len(resp.GetResults()[0].GetAlternatives()) == 0 { | |
return "", errors.New("no speech detected") | |
} | |
out := resp.GetResults()[0].GetAlternatives()[0].GetTranscript() | |
fmt.Printf("%s heard: %s\n", icon[p.lang], out) | |
return out, nil | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
🇺🇸 said: Mairzy doats and dozy doats and liddle lamzy divey
🇪🇸 heard: besito besito besito besito
🇨🇦 heard: ben si tu veux c'est toi merci tomber si tard
🇫🇷 heard: ben si tu veux c'est toi merci tomber si tard
🇺🇸 heard: 2206 Wyman Street in Bay City
🇪🇸 heard: cita centro salud sexual Madrid
🇺🇸 heard: cheetah Central Saloon sexual married
🇫🇷 heard: si la ceinture Saint-Cyr-sur-Mer
🇪🇸 heard: fina Sánchez Sánchez
🇨🇦 heard: Innocenti Center
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment