Speech-to-Text with OpenAI Whisper in Python
# Dependencies (PyPI package names assumed): torch, openai-whisper, termcolor
import torch
import whisper
from pathlib import Path
from termcolor import colored

class SpeechToTextConverter:
    """Batch-transcribe audio files to timestamped text with OpenAI Whisper."""

    def __init__(self, folders=["."], exts=[".mp3"], files=None):
        self.folders = folders
        self.files = files
        self.exts = exts

    def get_filepaths(self):
        # Accept a single string or a list for both extensions and folders
        if type(self.exts) is str:
            self.exts = [self.exts]
        if type(self.folders) is str:
            self.folders = [self.folders]
        # Recursively collect all matching files under each folder
        self.filepaths = []
        for folder in self.folders:
            for ext in self.exts:
                self.filepaths.extend(Path(folder).rglob(f"*{ext}"))
        print(
            colored(
                f"The following {len(self.filepaths)} files will be converted:",
                "light_magenta",
            )
        )
        for filepath in self.filepaths:
            print(f" > {filepath}")

    def seconds_to_timestamp_str(self, s):
        # Convert seconds (float) to an "MM:SS.mmm" timestamp string
        minutes = int(s / 60)
        seconds = int(s) % 60
        milliseconds = int((s - int(s)) * 1000)
        return f"{minutes:02}:{seconds:02}.{milliseconds:03}"

    def format_segments(self, segments):
        # Render each Whisper segment as "[start --> end] text"
        texts = []
        for segment in segments:
            text = segment["text"]
            start = segment["start"]
            end = segment["end"]
            start_timestamp_str = self.seconds_to_timestamp_str(start)
            end_timestamp_str = self.seconds_to_timestamp_str(end)
            line = f"[{start_timestamp_str} --> {end_timestamp_str}] {text}"
            texts.append(line)
        return "\n".join(texts)

    def convert(
        self,
        filepath,
        model_name="small",
        language="en",
        output_ext=".txt",
        output_filepath=None,
    ):
        # Note: the model is (re)loaded on every call; cache it if converting many files
        self.model = whisper.load_model(name=model_name)
        if not output_filepath:
            # Write the transcript next to the input file, with the output extension
            output_filepath = Path(filepath).with_suffix(output_ext)
        filepath = str(filepath)
        print(colored(f"Converting: [{filepath}]", "light_cyan"))
        result = self.model.transcribe(filepath, verbose=True, language=language)
        text = self.format_segments(result["segments"])
        with open(output_filepath, "w") as wf:
            wf.write(text)
        print(colored(f"Dumped: [{output_filepath}]", "light_green"))

    def run(self):
        self.get_filepaths()
        for filepath in self.filepaths:
            self.convert(filepath)

def check_cuda():
    # Report whether PyTorch can use CUDA (Whisper runs much faster on GPU)
    cuda_available = torch.cuda.is_available()
    cuda_version = torch.version.cuda
    print(colored(f"CUDA {cuda_version} Enabled: {cuda_available}", "light_green"))

if __name__ == "__main__":
    check_cuda()
    converter = SpeechToTextConverter(folders=["4th_1", "4th_2"], exts=[".mp3"])
    converter.run()
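
A minimal usage sketch for converting a single file rather than whole folders; the file name "lecture.mp3" and the "medium" model choice are illustrative assumptions, not part of the original gist:

# Hypothetical single-file usage (placeholder file name and model size)
converter = SpeechToTextConverter()
converter.convert(
    "lecture.mp3",        # input audio file (placeholder)
    model_name="medium",  # any Whisper size: tiny / base / small / medium / large
    language="en",
    output_ext=".txt",    # transcript is written next to the input, e.g. lecture.txt
)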