Skip to content

Instantly share code, notes, and snippets.

@ErikGranse
Created November 10, 2022 08:21
Show Gist options
  • Save ErikGranse/3e50f3c2d7f32ccb3e8b48efb1f81a1a to your computer and use it in GitHub Desktop.
Save ErikGranse/3e50f3c2d7f32ccb3e8b48efb1f81a1a to your computer and use it in GitHub Desktop.
Python parser for transcripts from MS Teams
import re, sys
from os.path import exists
def parse_file(input_file, output_file=None):
    """Condense a transcript so each speaker turn occupies one output line.

    Every input line is split on ``<`` and ``>``. Only lines that yield
    exactly five fields are treated as spoken text; all other lines are
    ignored. For such a line, field 1 is the speaker tag and field 2 is
    the text. Consecutive lines carrying the same speaker tag are joined
    with single spaces, and each turn is written as
    ``speaker<TAB>text`` followed by a newline.

    Args:
        input_file: iterable of text lines (for example an open file).
        output_file: object with a ``write`` method that receives the
            condensed transcript. Defaults to ``sys.stdout``.
    """
    if output_file is None:
        # Looked up at call time so a redirected sys.stdout is honoured.
        output_file = sys.stdout

    author_index = 1
    text_index = 2
    splitter = re.compile(r'<|>')  # compiled once instead of per line

    # None means "no turn started yet". Using a sentinel here (rather than
    # an empty string) prevents a blank line from being written before the
    # first turn, and keeps empty input from producing any output at all.
    author = None
    full_line = None

    for line in input_file:
        fields = splitter.split(line)
        if len(fields) != 5:
            continue

        line_author = fields[author_index]
        line_text = fields[text_index]

        if full_line is not None and line_author == author:
            # Same speaker keeps talking: extend the pending turn.
            full_line += ' ' + line_text
            continue

        # Speaker changed: flush the finished turn, then start a new one.
        if full_line is not None:
            output_file.write(full_line + "\n")
        author = line_author
        # Drop the first two characters of the tag -- presumably the "v "
        # prefix of a WebVTT voice tag such as "<v Name>".
        full_line = line_author[2:] + '\t' + line_text

    # Flush the last turn, if any was ever started.
    if full_line is not None:
        output_file.write(full_line + "\n")
def usage():
    """Terminate the program with the command-line usage text.

    The text is passed to ``sys.exit``, so this function never returns
    normally; it raises ``SystemExit`` carrying the message.
    """
    message = '''
usage: python3 parser.py input_file [output_file]
input_file: the file containing the raw transcript in Web Video Text Tracks (WebVTT) format
output_file: the destination of the condensed output. If not provided, output will be written to the console.
'''
    sys.exit(message)
def validate_path(path):
    """Exit the program with an error message unless *path* exists.

    Returns ``None`` when ``os.path.exists`` reports the path as present;
    otherwise calls ``sys.exit`` with a message naming the path.
    """
    if exists(path):
        return
    error_text = "The specified path %s does not exist or cannot be opened." % (path)
    sys.exit(error_text)
if __name__ == '__main__':
    # Command line: parser.py input_file [output_file]
    if len(sys.argv) < 2:
        usage()  # exits; nothing below runs without an input path

    input_path = sys.argv[1]
    validate_path(input_path)

    # WebVTT files are UTF-8 by specification, so decode explicitly rather
    # than relying on the platform default encoding. "utf-8-sig" also
    # tolerates an optional leading byte-order mark.
    with open(input_path, encoding='utf-8-sig') as input_file:
        if len(sys.argv) < 3:
            # No destination given: write the condensed transcript to the console.
            parse_file(input_file, sys.stdout)
        else:
            output_path = sys.argv[2]
            # Write UTF-8 so any speaker name read from the input can be encoded.
            with open(output_path, 'w', encoding='utf-8') as output_file:
                parse_file(input_file, output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment