th0rgall/convert-otter.py

## convert-otter.py
'''
author: Thor Galle <thorgalle+code@gmail.com>
original version: April 9, 2020
last update: April 9, 2020

A script to convert a .txt transcript exported from https://otter.ai/ to a format importable in Atlas TI v8.

Based on a similar script for oTranscribe I made on April 3, 2020.

USAGE
=====

A. Export your transcript in Otter.ai using these settings:
- Export format: .txt
- Include speaker names: yes
- Include timestamps: yes
- Merge same-speaker segments: no
- Export as monologue: no

B. Convert the downloaded file from Otter.ai with this script using the CLI:

        python convert-otter.py <input_file> [output_file]

[output_file] defaults to output.txt

C. The output file should be importable in Atlas TI

EXAMPLE
=======

Sample text before (input)
--------------------------
Thor Galle  0:49
Okay, that's a good point already. Like, good. Good. Good remark. Yeah, I see only four videos. That's all

Unknown Speaker  1:02
So

Sample text after (output)
--------------------------
[00:00:49]Thor Galle: Okay, that’s a good point already. Like, good. Good. Good remark. Yeah, I see only four videos. That’s all
[00:01:02]Unknown Speaker: So

HOW DOES THIS WORK?
====================

See this page to understand why I made this script, and why the output looks the way it does:
https://www.notion.so/thorgalle/Importing-plain-text-transcripts-into-Atlas-TI-3ee3c89ce0f94c2c9199e2e1711e57b4


TODO
====
- support for transcript of an hour or longer (00:00:00 format?)
- better support for double quotes (they're now converted to single quotes)

'''
import sys, re

# Typographic (curly) quote characters. Only the single quote is currently
# used by the replacement rules; the double-quote pair is kept for the
# planned double-quote support.
TYPOGRAPHIC_SINGLE_QUOTE = '’'
TYPOGRAPHIC_DOUBLE_QUOTE_BEGIN = '“'
# The closing quote needs its own name; assigning it to ..._BEGIN again would
# silently overwrite the opening quote defined on the previous line.
TYPOGRAPHIC_DOUBLE_QUOTE_END = '”'
# Output path used when no [output_file] argument is given on the command line.
OUTPUT_DEFAULT = 'output.txt'

# get the input & output files
def getFileNames():
        """Return ``(inputFile, outputFile)`` as given on the command line.

        ``inputFile`` is the first CLI argument, or ``None`` when the script was
        started without arguments. ``outputFile`` is the second CLI argument and
        falls back to ``OUTPUT_DEFAULT`` when it is missing.
        """
        # Drop the script name so positions map directly onto the arguments.
        cliArgs = sys.argv[1:]
        inputFile = cliArgs[0] if cliArgs else None
        outputFile = cliArgs[1] if len(cliArgs) > 1 else OUTPUT_DEFAULT
        return (inputFile, outputFile)


# rules for replacing text parts
def getReplaceTuples():
        """Return the ordered list of ``(compiled_pattern, replacement)`` rules.

        The rules, in application order:
        1. straight single quote  -> ``TYPOGRAPHIC_SINGLE_QUOTE``
        2. straight double quote  -> ``TYPOGRAPHIC_SINGLE_QUOTE`` (see TODO)
        3. ``&`` with at most one whitespace character on either side -> `` and ``
        """
        # TODO: double quotes should be converted to typographic double quotes, but that
        # requires more complex logic to detect the start/end of a quotation and to deal
        # with edge cases; until then they are mapped to the single typographic quote.
        return [
                (re.compile(r"'"), TYPOGRAPHIC_SINGLE_QUOTE),
                (re.compile(r'"'), TYPOGRAPHIC_SINGLE_QUOTE),
                (re.compile(r"\s?&\s?"), r' and '),
        ]

def flattenOtterTranscript(lines):
        """Collapse an Otter export into one ``[HH:MM:SS]Speaker: text`` string per segment.

        ``lines`` is read in groups of three: a "speaker  timestamp" header
        (index % 3 == 0), the spoken text (index % 3 == 1) and a third line that
        is ignored (the blank separator in an Otter export).

        Returns a list with one formatted string per header line.
        Raises ``ValueError`` when a header line cannot be parsed.
        """
        acc = []
        for index, line in enumerate(lines):
                lineType = index % 3
                if lineType == 0:  # name + timestamp
                        parsed = getTimeStampAndPrefix(line)
                        if parsed is None:
                                # Fail with the offending line instead of an opaque
                                # "not enough arguments for format string" TypeError.
                                raise ValueError(
                                        "line %d is not a 'speaker  timestamp' header: %r"
                                        % (index + 1, line)
                                )
                        timeStamp, speaker = parsed
                        # Transcripts of an hour or longer already carry an hour field;
                        # only add "00:" when the stamp is MM:SS.
                        if timeStamp.count(":") < 2:
                                timeStamp = "00:" + timeStamp
                        acc.append("[%s]%s: " % (timeStamp, speaker))
                elif lineType == 1:  # subtitle line: belongs to the latest header
                        acc[-1] += replaceLine(line)
                # else: leave out (separator line)
        return acc

# zero-pad every single-digit field of a timestamp, e.g. 2:01 -> 02:01
def padTime(timeStr):
        """Prefix every one-character ``:``-separated field of ``timeStr`` with ``0``.

        Fields of any other length (including empty ones) are kept as they are,
        e.g. ``"2:01"`` becomes ``"02:01"`` and ``"12:34"`` stays ``"12:34"``.
        """
        padded = []
        for field in timeStr.split(":"):
                if len(field) == 1:
                        field = "0" + field
                padded.append(field)
        return ":".join(padded)


# construct the time stamp and name prefix for a subtitle
def getTimeStampAndPrefix(line):
        """Split an Otter header line into ``(timestamp, speaker)``.

        ``line`` is expected to look like ``"Thor Galle  0:49"`` (speaker name,
        whitespace, then ``M:SS`` / ``MM:SS`` / ``H:MM:SS``).

        Returns a tuple such as ``('00:49', 'Thor Galle')`` with every timestamp
        field zero-padded by ``padTime``. When the line does not match, an error
        is printed and ``None`` is returned.
        """
        # The speaker is "anything ending in a non-space character" so that names
        # with punctuation (e.g. "Dr. Smith", "Jean-Luc", "O'Brien") are accepted;
        # the lazy quantifier makes the first timestamp on the line win.
        timeStampsR = re.compile(r'^(.*?\S)\s+((?:\d{1,2}:)?\d{1,2}:\d{2})')
        match = timeStampsR.match(line)
        if match:
                return (padTime(match.group(2)), match.group(1))
        # Show the offending line so the user can locate it in the export.
        print("Error: couldn't parse line: %r" % line)
        return None

# replaces characters that cause problems for the Atlas TI importer with valid variants
def replaceLine(line):
        """Return ``line`` after applying every rule from ``getReplaceTuples()`` in order."""
        cleaned = line
        for pattern, replacement in getReplaceTuples():
                # Each rule works on the output of the previous one.
                cleaned = pattern.sub(replacement, cleaned)
        return cleaned

def _stripOtterFooter(lines):
        """Return ``lines`` without trailing blank lines and the "Transcribed by ..." footer."""
        trimmed = list(lines)
        while trimmed and not trimmed[-1]:
                trimmed.pop()
        # Only drop the last line when it really is the irregular Otter footer, so a
        # file whose footer was already removed does not lose its final segment.
        if trimmed and trimmed[-1].startswith("Transcribed by"):
                trimmed.pop()
        return trimmed


def main():
        """Convert the Otter export named on the command line and write the result.

        Exits with a usage message when no input file is given. Both files are
        handled as UTF-8 so the typographic quotes written by the replacement
        rules do not depend on the platform's default encoding.
        """
        (inputFile, outputFile) = getFileNames()
        if inputFile is None:
                # sys.exit with a string prints it to stderr and exits with status 1.
                sys.exit("usage: python convert-otter.py <input_file> [output_file]")
        with open(inputFile, encoding="utf-8") as f:
                content = [line.strip() for line in f]
        content = flattenOtterTranscript(_stripOtterFooter(content))
        with open(outputFile, 'w', encoding="utf-8") as fo:
                fo.writelines("%s\n" % l for l in content)


if __name__ == "__main__":
        main()
	'''
	author: Thor Galle <thorgalle+code@gmail.com>
	original version: April 9, 2020
	last update: April 9, 2020

	A script to convert a .txt transcript exported from https://otter.ai/ to a format importable in Atlas TI v8.

	Based on a similar script for oTranscribe I made on April 3, 2020.

	USAGE
	=====

	A. Export your transcript in Otter.ai using these settings:
	- Export format: .txt
	- Include speaker names: yes
	- Include timestamps: yes
	- Merge same-speaker segments: no
	- Export as monologue: no

	B. Convert the downloaded file from Otter.ai with this script using the CLI:

	python convert-otter.py <input_file> [output_file]

	[output_file] defaults to output.txt

	C. The output file should be importable in Atlas TI

	EXAMPLE
	=======

	Sample text before (input)
	--------------------------
	Thor Galle 0:49
	Okay, that's a good point already. Like, good. Good. Good remark. Yeah, I see only four videos. That's all

	Unknown Speaker 1:02
	So

	Sample text after (output)
	--------------------------
	[00:00:49]Thor Galle: Okay, that’s a good point already. Like, good. Good. Good remark. Yeah, I see only four videos. That’s all
	[00:01:02]Unknown Speaker: So

	HOW DOES THIS WORK?
	====================

	See this page to understand why I made this script, and why the output looks the way it does:
	https://www.notion.so/thorgalle/Importing-plain-text-transcripts-into-Atlas-TI-3ee3c89ce0f94c2c9199e2e1711e57b4


	TODO
	====
	- support for transcript of an hour or longer (00:00:00 format?)
	- better support for double quotes (they're now converted to single quotes)

	'''
	import sys, re

	# Typographic (curly) quote characters. Only the single quote is currently
	# used by the replacement rules; the double-quote pair is kept for the
	# planned double-quote support.
	TYPOGRAPHIC_SINGLE_QUOTE = '’'
	TYPOGRAPHIC_DOUBLE_QUOTE_BEGIN = '“'
	# The closing quote needs its own name; assigning it to ..._BEGIN again would
	# silently overwrite the opening quote defined on the previous line.
	TYPOGRAPHIC_DOUBLE_QUOTE_END = '”'
	# Output path used when no [output_file] argument is given on the command line.
	OUTPUT_DEFAULT = 'output.txt'

	# get the input & output files
	def getFileNames():
		"""Return ``(inputFile, outputFile)`` as given on the command line.

		``inputFile`` is the first CLI argument, or ``None`` when the script was
		started without arguments. ``outputFile`` is the second CLI argument and
		falls back to ``OUTPUT_DEFAULT`` when it is missing.
		"""
		# Drop the script name so positions map directly onto the arguments.
		cliArgs = sys.argv[1:]
		inputFile = cliArgs[0] if cliArgs else None
		outputFile = cliArgs[1] if len(cliArgs) > 1 else OUTPUT_DEFAULT
		return (inputFile, outputFile)


	# rules for replacing text parts
	def getReplaceTuples():
		"""Return the ordered list of ``(compiled_pattern, replacement)`` rules.

		The rules, in application order:
		1. straight single quote  -> ``TYPOGRAPHIC_SINGLE_QUOTE``
		2. straight double quote  -> ``TYPOGRAPHIC_SINGLE_QUOTE`` (see TODO)
		3. ``&`` with at most one whitespace character on either side -> `` and ``
		"""
		# TODO: double quotes should be converted to typographic double quotes, but that
		# requires more complex logic to detect the start/end of a quotation and to deal
		# with edge cases; until then they are mapped to the single typographic quote.
		return [
			(re.compile(r"'"), TYPOGRAPHIC_SINGLE_QUOTE),
			(re.compile(r'"'), TYPOGRAPHIC_SINGLE_QUOTE),
			(re.compile(r"\s?&\s?"), r' and '),
		]

	def flattenOtterTranscript(lines):
		"""Collapse an Otter export into one ``[HH:MM:SS]Speaker: text`` string per segment.

		``lines`` is read in groups of three: a "speaker  timestamp" header
		(index % 3 == 0), the spoken text (index % 3 == 1) and a third line that
		is ignored (the blank separator in an Otter export).

		Returns a list with one formatted string per header line.
		Raises ``ValueError`` when a header line cannot be parsed.
		"""
		acc = []
		for index, line in enumerate(lines):
			lineType = index % 3
			if lineType == 0:  # name + timestamp
				parsed = getTimeStampAndPrefix(line)
				if parsed is None:
					# Fail with the offending line instead of an opaque
					# "not enough arguments for format string" TypeError.
					raise ValueError(
						"line %d is not a 'speaker  timestamp' header: %r"
						% (index + 1, line)
					)
				timeStamp, speaker = parsed
				# Transcripts of an hour or longer already carry an hour field;
				# only add "00:" when the stamp is MM:SS.
				if timeStamp.count(":") < 2:
					timeStamp = "00:" + timeStamp
				acc.append("[%s]%s: " % (timeStamp, speaker))
			elif lineType == 1:  # subtitle line: belongs to the latest header
				acc[-1] += replaceLine(line)
			# else: leave out (separator line)
		return acc

	# zero-pad every single-digit field of a timestamp, e.g. 2:01 -> 02:01
	def padTime(timeStr):
		"""Prefix every one-character ``:``-separated field of ``timeStr`` with ``0``.

		Fields of any other length (including empty ones) are kept as they are,
		e.g. ``"2:01"`` becomes ``"02:01"`` and ``"12:34"`` stays ``"12:34"``.
		"""
		padded = []
		for field in timeStr.split(":"):
			if len(field) == 1:
				field = "0" + field
			padded.append(field)
		return ":".join(padded)


	# construct the time stamp and name prefix for a subtitle
	def getTimeStampAndPrefix(line):
		"""Split an Otter header line into ``(timestamp, speaker)``.

		``line`` is expected to look like ``"Thor Galle  0:49"`` (speaker name,
		whitespace, then ``M:SS`` / ``MM:SS`` / ``H:MM:SS``).

		Returns a tuple such as ``('00:49', 'Thor Galle')`` with every timestamp
		field zero-padded by ``padTime``. When the line does not match, an error
		is printed and ``None`` is returned.
		"""
		# The speaker is "anything ending in a non-space character" so that names
		# with punctuation (e.g. "Dr. Smith", "Jean-Luc", "O'Brien") are accepted;
		# the lazy quantifier makes the first timestamp on the line win.
		timeStampsR = re.compile(r'^(.*?\S)\s+((?:\d{1,2}:)?\d{1,2}:\d{2})')
		match = timeStampsR.match(line)
		if match:
			return (padTime(match.group(2)), match.group(1))
		# Show the offending line so the user can locate it in the export.
		print("Error: couldn't parse line: %r" % line)
		return None

	# replaces characters that cause problems for the Atlas TI importer with valid variants
	def replaceLine(line):
		"""Return ``line`` after applying every rule from ``getReplaceTuples()`` in order."""
		cleaned = line
		for pattern, replacement in getReplaceTuples():
			# Each rule works on the output of the previous one.
			cleaned = pattern.sub(replacement, cleaned)
		return cleaned

	def _stripOtterFooter(lines):
		"""Return ``lines`` without trailing blank lines and the "Transcribed by ..." footer."""
		trimmed = list(lines)
		while trimmed and not trimmed[-1]:
			trimmed.pop()
		# Only drop the last line when it really is the irregular Otter footer, so a
		# file whose footer was already removed does not lose its final segment.
		if trimmed and trimmed[-1].startswith("Transcribed by"):
			trimmed.pop()
		return trimmed


	def main():
		"""Convert the Otter export named on the command line and write the result.

		Exits with a usage message when no input file is given. Both files are
		handled as UTF-8 so the typographic quotes written by the replacement
		rules do not depend on the platform's default encoding.
		"""
		(inputFile, outputFile) = getFileNames()
		if inputFile is None:
			# sys.exit with a string prints it to stderr and exits with status 1.
			sys.exit("usage: python convert-otter.py <input_file> [output_file]")
		with open(inputFile, encoding="utf-8") as f:
			content = [line.strip() for line in f]
		content = flattenOtterTranscript(_stripOtterFooter(content))
		with open(outputFile, 'w', encoding="utf-8") as fo:
			fo.writelines("%s\n" % l for l in content)


	if __name__ == "__main__":
		main()