stuhlmueller/convert-transcripts.py

## convert-transcripts.py
"""
Converts a directory with Rev.com transcripts into the dataset format required by Elicit.

Format for transcripts:

    https://www.rev.com/transcript-editor/Edit?token=12312

    Jane Doe:
    Good morning. Thank you everyone for joining our webinar. My name is Jane Doe.

    Jon Doe:
    Hi. My name is Jon Doe.

This is the usual Rev.com export format except the first line contains the URL of the transcript.
"""

import glob
import os

transcripts_path = "./transcripts/*.txt"


def parse_transcript(text):
    """Split the text of one transcript into its source URL and messages.

    The first line of ``text`` is taken as the source URL. The rest is cut
    into blocks at blank lines; the first block (the one holding the URL) is
    skipped. Every other block must start with a ``Speaker:`` header line,
    followed by one or more content lines.

    Args:
        text: Full contents of a transcript file.

    Returns:
        A tuple ``(source_url, messages)`` where ``messages`` is a list of
        ``(participant, content)`` pairs, one per non-empty content line.

    Raises:
        ValueError: If a block's first non-empty line does not end with ":".
    """
    blocks = text.split("\n\n")
    source_url = blocks[0].split("\n")[0].strip()

    messages = []
    for block in blocks[1:]:
        # Strip each line and drop empty ones first, so that an extra blank
        # line before a speaker header does not discard the whole message.
        lines = [line.strip() for line in block.split("\n")]
        lines = [line for line in lines if line]
        if not lines:
            continue
        header, *contents = lines
        # Raise explicitly instead of asserting: asserts vanish under -O.
        if not header.endswith(":"):
            raise ValueError(f"expected a 'Speaker:' line, got: {header!r}")
        participant = header[:-1]
        messages.extend((participant, content) for content in contents)
    return source_url, messages


def format_entry(content, participant, title, source_url):
    """Return the three-line dataset entry for one message line.

    The entry starts with a newline, so consecutive entries are separated by
    a blank line when printed.
    """
    return (
        f"\nMessage: {content}"
        f"\nParticipant: {participant}"
        f"\nSource: [{title}]({source_url})"
    )


def main(pattern=transcripts_path):
    """Print a dataset entry for every message in the files matching ``pattern``.

    The title of each entry is the file name without its extension.
    """
    # Sort so the output order does not depend on the file system.
    for filename in sorted(glob.glob(pattern)):
        # Explicit encoding: the default depends on the machine's locale.
        with open(filename, encoding="utf-8") as file:
            text = file.read()
        title = os.path.splitext(os.path.basename(filename))[0]
        source_url, messages = parse_transcript(text)
        for participant, content in messages:
            print(format_entry(content, participant, title, source_url))


if __name__ == "__main__":
    main()
	"""
	Converts a directory with Rev.com transcripts into the dataset format required by Elicit.

	Format for transcripts:

	https://www.rev.com/transcript-editor/Edit?token=12312

	Jane Doe:
	Good morning. Thank you everyone for joining our webinar. My name is Jane Doe.

	Jon Doe:
	Hi. My name is Jon Doe.

	This is the usual Rev.com export format except the first line contains the URL of the transcript.
	"""

	import glob
	import os

	transcripts_path = "./transcripts/*.txt"


	def parse_transcript(text):
	    """Split the text of one transcript into its source URL and messages.

	    The first line of ``text`` is taken as the source URL. The rest is cut
	    into blocks at blank lines; the first block (the one holding the URL) is
	    skipped. Every other block must start with a ``Speaker:`` header line,
	    followed by one or more content lines.

	    Args:
	        text: Full contents of a transcript file.

	    Returns:
	        A tuple ``(source_url, messages)`` where ``messages`` is a list of
	        ``(participant, content)`` pairs, one per non-empty content line.

	    Raises:
	        ValueError: If a block's first non-empty line does not end with ":".
	    """
	    blocks = text.split("\n\n")
	    source_url = blocks[0].split("\n")[0].strip()

	    messages = []
	    for block in blocks[1:]:
	        # Strip each line and drop empty ones first, so that an extra blank
	        # line before a speaker header does not discard the whole message.
	        lines = [line.strip() for line in block.split("\n")]
	        lines = [line for line in lines if line]
	        if not lines:
	            continue
	        header, *contents = lines
	        # Raise explicitly instead of asserting: asserts vanish under -O.
	        if not header.endswith(":"):
	            raise ValueError(f"expected a 'Speaker:' line, got: {header!r}")
	        participant = header[:-1]
	        messages.extend((participant, content) for content in contents)
	    return source_url, messages


	def format_entry(content, participant, title, source_url):
	    """Return the three-line dataset entry for one message line.

	    The entry starts with a newline, so consecutive entries are separated by
	    a blank line when printed.
	    """
	    return (
	        f"\nMessage: {content}"
	        f"\nParticipant: {participant}"
	        f"\nSource: [{title}]({source_url})"
	    )


	def main(pattern=transcripts_path):
	    """Print a dataset entry for every message in the files matching ``pattern``.

	    The title of each entry is the file name without its extension.
	    """
	    # Sort so the output order does not depend on the file system.
	    for filename in sorted(glob.glob(pattern)):
	        # Explicit encoding: the default depends on the machine's locale.
	        with open(filename, encoding="utf-8") as file:
	            text = file.read()
	        title = os.path.splitext(os.path.basename(filename))[0]
	        source_url, messages = parse_transcript(text)
	        for participant, content in messages:
	            print(format_entry(content, participant, title, source_url))


	if __name__ == "__main__":
	    main()