Created
February 3, 2024 08:28
-
-
Save cr2007/41799701d923d18603649fb3a261b341 to your computer and use it in GitHub Desktop.
π
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
import pathlib
import subprocess
import time

import whisper
import yt_dlp
from whisper.utils import get_writer
# --- Download the source video into input/ ----------------------------------
# Ensure the download directory exists, and remove any stale video from a
# previous run so yt-dlp starts from a clean slate.
os.makedirs("input", exist_ok=True)
if os.path.exists("input/video.mp4"):
    os.remove("input/video.mp4")

URLS = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]  # video(s) to transcribe

ydl_opts = {
    "outtmpl": "input/video.%(ext)s",  # fixed name so later steps can find it
    "format": "mp4",
    # BUG FIX: the yt-dlp option is "overwrites" and takes a bool; the original
    # "overwrite": "True" (misspelled key, string value) was silently ignored.
    "overwrites": True,
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    error_code = ydl.download(URLS)  # 0 on success
# --- Transcription configuration ---------------------------------------------
# Choose model to use by uncommenting
# modelName = "tiny.en"
# modelName = "base.en"
# modelName = "small.en"
modelName = "medium.en"
# modelName = "large-v2"

# Other Variables
# Whether to export the segment data to a JSON file. Will include word-level
# timestamps if word_timestamps is True.
exportTimestampData = True
outputFolder = "output"

# ----- Select variables for transcribe method -----
# audio: path to audio file
verbose = True  # True = full details, False = minimal, None = silent
language = "english"  # language of the audio file
word_timestamps = True  # word-level timestamps via cross-attention + DTW
# initial_prompt = ""  # optional context/vocabulary hint for the first window
# -----------------------------------------------------------------------------

print(f"Using Model: {modelName}")

filePath = "input/video.mp4"
if not os.path.exists(filePath):
    print("Problem Getting File...")
    input("Press Enter to Exit...")
    # BUG FIX: bare exit() is a site-module convenience intended for
    # interactive sessions; raise SystemExit works unconditionally.
    raise SystemExit(1)

# If output folder does not exist, create it
if not os.path.exists(outputFolder):
    os.makedirs(outputFolder)
    print("Created Output Folder.\n")

# Get filename stem using pathlib (filename without extension)
fileNameStem = pathlib.Path(filePath).stem
resultFileName = f"{fileNameStem}.txt"
jsonFileName = f"{fileNameStem}.json"
# --- Run the transcription and export txt / json / srt -----------------------
model = whisper.load_model(modelName)

start = time.time()
# ---------------------------------------------------
result = model.transcribe(
    audio=filePath,
    language=language,
    word_timestamps=word_timestamps,
    verbose=verbose,
)
# ---------------------------------------------------
elapsed = time.time() - start  # already a float; the float() cast was redundant

# Save transcription text to file
print("\nWriting transcription to file...")
with open(os.path.join(outputFolder, resultFileName), "w", encoding="utf-8") as file:
    file.write(result["text"])
print("Finished writing transcription file.")

# Save the segments data to json file
if exportTimestampData:  # truthiness check; "is True" identity test was fragile
    print("\nWriting segment data to file...")
    with open(os.path.join(outputFolder, jsonFileName), "w", encoding="utf-8") as file:
        json.dump(result["segments"], file, indent=4)
    print("Finished writing segment data file.")

# Write a .srt subtitle file next to the other outputs (word highlighting on)
word_options = {
    "highlight_words": True,
}
srt_writer = get_writer("srt", outputFolder)
srt_writer(result, filePath, word_options)

print(f"\nElapsed Time With {modelName} Model: {elapsed:.2f} seconds")
# --- Burn the subtitles into the video with ffmpeg ---------------------------
# Define file paths
video_path = "input/video.mp4"
subtitle_path = "output/video.srt"
output_path = "output/output.mp4"

# BUG FIX: "-y" (overwrite output without asking) must precede the output file;
# ffmpeg ignores trailing options, so the original placement meant a second
# run would hang on the interactive overwrite prompt.
overlay_subtitles = [
    "ffmpeg",
    "-y",
    "-i",
    video_path,
    "-vf",
    f"subtitles={subtitle_path}:force_style='FontName=JetBrainsMono Nerd Font, Regular'",
    output_path,
]
# check=True raises CalledProcessError on a non-zero exit instead of
# silently producing no (or a partial) output file.
subprocess.run(overlay_subtitles, check=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment