Skip to content

Instantly share code, notes, and snippets.

@digiguru
Created March 14, 2024 21:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save digiguru/2675da991f5c10c203732cd1244d559a to your computer and use it in GitHub Desktop.
Save digiguru/2675da991f5c10c203732cd1244d559a to your computer and use it in GitHub Desktop.
A simple script that reads a VTT file (a Zoom transcript) and turns it into a plain-text dialogue between the people talking.
import os
import re
import sys
# Matches the start of a WebVTT cue timing line, e.g.
# "00:00:01.000 --> 00:00:04.000". The hours field is optional, and because the
# pattern is only anchored at the start of the line, anything following the end
# timestamp on the same line is ignored.
_CUE_TIMING_RE = re.compile(
    r'(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}[ \t]+-->[ \t]+(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}'
)

# Splits "Speaker: text" into (speaker, text). This is a heuristic: any cue
# whose text contains ": " is treated as starting with a speaker label.
_SPEAKER_RE = re.compile(r'([^:]+): (.*)')


def _extract_cue_texts(content):
    """Return the text of every cue in *content*, in file order.

    Each cue's text lines are stripped and joined with single spaces. Blocks
    without a timing line (the ``WEBVTT`` header, notes, etc.) and cues with
    no text are skipped.
    """
    cue_texts = []
    # Blocks are separated by one or more blank lines.
    for block in re.split(r'\n\s*\n', content):
        lines = block.strip('\n').split('\n')
        # The timing line is the first line of a cue, or the second when the
        # cue carries an identifier (the numeric counter in the expected input).
        timing_index = next(
            (i for i, line in enumerate(lines[:2]) if _CUE_TIMING_RE.match(line)),
            None,
        )
        if timing_index is None:
            continue
        stripped_lines = (line.strip() for line in lines[timing_index + 1:])
        text = ' '.join(line for line in stripped_lines if line)
        if text:
            cue_texts.append(text)
    return cue_texts


def _merge_speaker_turns(cue_texts):
    """Collapse consecutive cues from the same speaker into one line each."""
    turns = []
    last_speaker = None
    for cue_text in cue_texts:
        speaker_match = _SPEAKER_RE.match(cue_text)
        if speaker_match is None:
            # No speaker label: treat as a continuation of the previous line,
            # or as standalone text when nothing has been emitted yet.
            if turns:
                turns[-1] += ' ' + cue_text
            else:
                turns.append(cue_text)
            continue
        speaker, text = speaker_match.groups()
        if speaker == last_speaker:
            turns[-1] += ' ' + text
        else:
            turns.append(f'{speaker}: {text}')
            last_speaker = speaker
    return turns


def parse_vtt_to_flat(filename):
    """Convert a VTT transcript into a flat "Speaker: text" dialogue file.

    Reads *filename* as UTF-8, merges consecutive cues spoken by the same
    speaker, and writes the result next to the input as
    ``<name>_flat.txt`` (the input's extension, if any, is dropped). A
    confirmation message is printed on success.

    Args:
        filename: Path to the ``.vtt`` transcript.

    Returns:
        None. I/O and decoding failures are not raised; they are reported
        on stdout as ``Error: <message>``.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read()

        dialogue_lines = _merge_speaker_turns(_extract_cue_texts(content))

        # splitext only strips an extension from the final path component, so
        # dots in directory names (or a leading "./") are left alone.
        flat_filename = os.path.splitext(filename)[0] + '_flat.txt'
        with open(flat_filename, 'w', encoding='utf-8') as outfile:
            outfile.write('\n'.join(dialogue_lines))
        print(f"Flat file created: {flat_filename}")
    except (OSError, UnicodeError) as e:
        print(f"Error: {e}")
if __name__ == "__main__":
    # Script entry point: convert the transcript named by the first
    # command-line argument, or show usage when none was given.
    cli_args = sys.argv[1:]
    if cli_args:
        parse_vtt_to_flat(cli_args[0])
    else:
        print("Usage: python script.py <filename.vtt>")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment