Created
April 3, 2023 07:24
-
-
Save jigangkim/c85e3dcf85472dc6c7ba702ea23ea6ca to your computer and use it in GitHub Desktop.
Speech/Subtitle generation with Google Cloud and OpenAI Whisper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Copyright 2018 Google LLC | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# | |
# All Rights Reserved. | |
"""Adapted from Google Cloud Text-To-Speech API sample application.""" | |
import argparse | |
import glob | |
import os | |
VOICENAME = ["en-US-Neural2-J", "en-US-Studio-M"][0] | |
ENCODING_SEL = 2 # 2: MP3, 3: OGG_OPUS | |
RATE = 1.0 | |
PITCH = 0.0 | |
GAIN = 0.0 | |
def synthesize_text_file(text_file): | |
"""Synthesizes speech from the input file of text.""" | |
from google.cloud import texttospeech | |
client = texttospeech.TextToSpeechClient() | |
with open(text_file, "r") as f: | |
text = f.read() | |
input_text = texttospeech.SynthesisInput(text=text) | |
# Note: the voice can also be specified by name. | |
# Names of voices can be retrieved with client.list_voices(). | |
voice = texttospeech.VoiceSelectionParams( | |
language_code="en-US", | |
name=VOICENAME, | |
) | |
audio_config = texttospeech.AudioConfig( | |
audio_encoding=texttospeech.AudioEncoding(ENCODING_SEL), | |
speaking_rate=RATE, | |
pitch=PITCH, | |
volume_gain_db=GAIN, | |
) | |
response = client.synthesize_speech( | |
request={"input": input_text, "voice": voice, "audio_config": audio_config} | |
) | |
name = os.path.basename(text_file).split('.')[0] | |
ext = ['mp3', 'ogg'][ENCODING_SEL-2] | |
# The response's audio_content is binary. | |
with open(f"{name}.{ext}", "wb") as out: | |
out.write(response.audio_content) | |
print(f'Audio content written to file "{name}.{ext}"') | |
def synthesize_ssml_file(ssml_file): | |
"""Synthesizes speech from the input file of ssml. | |
Note: ssml must be well-formed according to: | |
https://www.w3.org/TR/speech-synthesis/ | |
""" | |
from google.cloud import texttospeech | |
client = texttospeech.TextToSpeechClient() | |
with open(ssml_file, "r") as f: | |
ssml = f.read() | |
input_text = texttospeech.SynthesisInput(ssml=ssml) | |
# Note: the voice can also be specified by name. | |
# Names of voices can be retrieved with client.list_voices(). | |
voice = texttospeech.VoiceSelectionParams( | |
language_code="en-US", | |
name=VOICENAME, | |
) | |
audio_config = texttospeech.AudioConfig( | |
audio_encoding=texttospeech.AudioEncoding(ENCODING_SEL), | |
speaking_rate=RATE, | |
pitch=PITCH, | |
volume_gain_db=GAIN, | |
) | |
response = client.synthesize_speech( | |
input=input_text, voice=voice, audio_config=audio_config | |
) | |
name = os.path.basename(ssml_file).split('.')[0] | |
ext = ['mp3', 'ogg'][ENCODING_SEL-2] | |
# The response's audio_content is binary. | |
with open(f"{name}.{ext}", "wb") as out: | |
out.write(response.audio_content) | |
print(f'Audio content written to file "{name}.{ext}"') | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--file_path', type=str, default='./') | |
parser.add_argument('--file_ext', type=str, default='.txt') | |
args = parser.parse_args() | |
list_of_scripts = [os.path.abspath(match) for match in sorted(glob.glob(args.file_path + '/*%s'%(args.file_ext)))] | |
if len(list_of_scripts) == 0: | |
print('No files found in %s'%(args.file_path)) | |
exit(1) | |
else: | |
if args.file_ext == '.txt': | |
for script in list_of_scripts: | |
synthesize_text_file(script) | |
elif args.file_ext == '.ssml': | |
for script in list_of_scripts: | |
synthesize_ssml_file(script) | |
else: | |
raise ValueError('File extension %s not supported'%(args.file_ext)) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import os | |
import torch | |
import whisper | |
def main(audio_path): | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
model = whisper.load_model("small.en", device=device) | |
writer = whisper.utils.get_writer("srt", ".") | |
result = model.transcribe(audio_path) | |
writer(result, audio_path) | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--file', type=str, default='./audio.mp3') | |
args = parser.parse_args() | |
os.path.isfile(args.file) | |
main(args.file) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment