Created
February 9, 2023 03:04
-
-
Save BryantD/9eb4f7f2214ddbf7324dcd0ffd4e3cd6 to your computer and use it in GitHub Desktop.
Meet Transcript Processor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
from textwrap import wrap | |
# First tokens of the WebVTT header lines; they carry no dialogue.
_HEADER_TOKENS = ("WEBVTT", "Kind:", "Language:")

# Final characters that already read as a finished (or explicitly cut-off)
# statement, so no interruption marker needs to be appended.
_TERMINAL_PUNCTUATION = (".", ",", "?", "!", "-")


def _format_turn(speaker, text):
    """Return the wrapped output lines for one speaker's finished turn.

    Args:
        speaker: Speaker label taken from a "(Name)" line ("" if none seen).
        text: Raw dialogue accumulated for that speaker.

    Returns:
        A list of output lines, or an empty list when ``text`` holds nothing
        but whitespace.
    """
    text = text.strip()
    if not text:
        return []

    if text[-1] not in _TERMINAL_PUNCTUATION:
        # Handle interruptions:
        # if a turn doesn't end in punctuation, add a dash
        text += " --"

    if text[0] == text[0].lower():
        # Handle follow-on statements:
        # if a turn doesn't start with a capital letter,
        # it's probably a follow-on
        text = "-- " + text

    return wrap(f"{speaker}: {text}", subsequent_indent=" ")


def _print_turn(speaker, text):
    """Print one speaker's turn followed by a blank line; no-op if empty."""
    lines = _format_turn(speaker, text)
    if not lines:
        return
    for out_line in lines:
        print(out_line)
    print("")


def main(argv=None):
    """Convert a Meet WebVTT transcript into speaker-labelled paragraphs.

    Header lines and cue timing lines are dropped, consecutive dialogue lines
    are merged per speaker, and each turn is printed to stdout as a wrapped
    "Speaker: text" paragraph followed by a blank line.

    Args:
        argv: Optional argument list (the transcript path). Defaults to the
            process command line when omitted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file", help="File to process")
    args = parser.parse_args(argv)

    speaker = ""
    bulk_text = ""

    # utf-8-sig: WebVTT files are UTF-8 and may start with a byte-order mark;
    # left in place, the mark would stop the first line matching "WEBVTT".
    with open(args.file, encoding="utf-8-sig") as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue

            token = line.split()[0]
            if token in _HEADER_TOKENS or "-->" in line:
                # File header or cue timing line: nothing to transcribe.
                continue

            if line[0] == "(" and line[-1] == ")":
                # "(Name)" announces who speaks next.
                new_speaker = line[1:-1]
                if speaker != new_speaker:
                    _print_turn(speaker, bulk_text)
                    bulk_text = ""
                    speaker = new_speaker
            elif line == "-":
                # A single dash on a line alone is how Meet transcripts
                # indicate an interruption.
                if bulk_text and bulk_text[-1] in (",", "-"):
                    bulk_text = bulk_text[:-1] + " --"
            else:
                bulk_text += " " + line

    # Turns are otherwise only printed when the speaker changes, so the last
    # speaker's text must be flushed explicitly at end of file.
    _print_turn(speaker, bulk_text)
# Run the processor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment