Skip to content

Instantly share code, notes, and snippets.

@timfel
Created June 1, 2024 12:10
Show Gist options
  • Save timfel/2c802275493e1874b366eb33b5f32f42 to your computer and use it in GitHub Desktop.
tiptoi translations
openai-whisper
torch
transformers
numpy
soundfile
datasets
sentencepiece
sacremoses
import glob
import os
import torch
from transformers import pipeline
from datasets import load_dataset
import whisper
import soundfile
class S2S:
    """Lazy speech-to-speech translation pipeline: German audio -> English audio.

    Each stage (language detection, transcription, translation, synthesis) is
    exposed as a property that builds its underlying model on first access and
    caches the resulting callable on the instance, so unused stages never load.
    """

    def __init__(self) -> None:
        # Prefer the first CUDA device when available.
        # BUG FIX: the original computed this value (via a redundant double
        # assignment) but never passed it to any model, so everything ran on
        # CPU even with a GPU present. It is now used by every stage below.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

    @property
    def detect_language(self):
        """Callable mapping an audio file path to a whisper language code (e.g. 'de')."""
        if not hasattr(self, "_detect_language"):
            model = whisper.load_model("base", device=self.device)

            def _detect_language(audiofile):
                # load audio and pad/trim it to fit 30 seconds
                audio = whisper.load_audio(audiofile)
                audio = whisper.pad_or_trim(audio)
                # make log-Mel spectrogram and move to the same device as the model
                mel = whisper.log_mel_spectrogram(audio).to(model.device)
                # detect the spoken language; probs maps language code -> probability
                _, probs = model.detect_language(mel)
                return max(probs, key=probs.get)

            self._detect_language = _detect_language
        return self._detect_language

    @property
    def transcribe(self):
        """Callable mapping a German audio file path to its transcription text."""
        if not hasattr(self, "_transcribe"):
            pipe = pipeline(
                "automatic-speech-recognition",
                model="facebook/wav2vec2-large-xlsr-53-german",
                device=self.device,
            )
            # soundfile.read returns (samples, samplerate); the ASR pipeline
            # only needs the raw samples.
            self._transcribe = lambda audiofile: pipe(soundfile.read(audiofile)[0])['text']
        return self._transcribe

    @property
    def translate(self):
        """Callable mapping German text to its English translation."""
        if not hasattr(self, "_translate"):
            pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en", device=self.device)
            self._translate = lambda text: pipe(text)[0]["translation_text"]
        return self._translate

    @property
    def speak(self):
        """Callable (text, output_path) -> None synthesizing English speech to an OGG file."""
        if not hasattr(self, "_speak"):
            synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=self.device)
            embeddings_dataset = load_dataset("Dupaja/cmu-arctic-xvectors", split="validation", trust_remote_code=True)
            # Fixed speaker voice (index 7306 of the xvector dataset).
            # You can replace this embedding with your own as well.
            speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

            def _speak_lambda(text, output):
                speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
                soundfile.write(output, speech["audio"], samplerate=speech["sampling_rate"], format='OGG')

            self._speak = _speak_lambda
        return self._speak
if __name__ == "__main__":
    from argparse import ArgumentParser

    # CLI: --input is a glob pattern of audio files; --output is either a
    # directory (one output file per input, same base name) or a single path.
    cli = ArgumentParser()
    cli.add_argument("--input", "-i")
    cli.add_argument("--output", "-o")
    opts = cli.parse_args()

    s2s_pipeline = S2S()
    for path in glob.glob(opts.input):
        print(path, end=" ")
        # When the output is a directory, mirror the input file name into it.
        if os.path.isdir(opts.output):
            target = os.path.join(opts.output, os.path.basename(path))
        else:
            target = opts.output
        # Only process files whose spoken language is detected as German.
        if s2s_pipeline.detect_language(path).lower() != "de":
            print("is not detected as lang de")
            continue
        german_text = s2s_pipeline.transcribe(path)
        english_text = s2s_pipeline.translate(german_text)
        print(german_text, "->", english_text)
        s2s_pipeline.speak(english_text, target)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment