th0rgall/convert-otrans.py

## convert-otrans.py
'''
author: Thor Galle <@th0rgall>
original version: April 3, 2020
last updated: April 9, 2020

A script to convert a Markdown transcript exported from https://otranscribe.com/ to something importable in Atlas TI v8.

USAGE: python convert-otrans.py <input_file> [output_file]

[output_file] defaults to output.txt

Note: oTranscribe text is assumed to start with a timestamp on each line

EXAMPLE
=======

Sample text before (input)
--------------
02:03 I think it's just the way it should be when it comes to.. online streaming services

02:06 T: Yes, so you've seen this before?

02:08 Uhuh. I don't know what the other buttons do though. It's like (un) .. oh ok, now I see

Sample text after (output)
------------
[00:02:03]Participant: I think it’s just the way it should be when it comes to.. online streaming services
[00:02:06]Thor: Yes, so you’ve seen this before?
[00:02:08]Participant: Uhuh. I don’t know what the other buttons do though. It’s like [unclear] .. oh ok, now I see

HOW IT WORKS
=============
This script works by:
1. Cleaning and transforming the oTranscribe output so that it works for Atlas TI
2. Replacing some shorthand formulations I used with their full equivalents via regular expressions
The rules:
        T: => Thor:             (replace with your own name using the variables in the code below)
        for other lines, it inserts Participant:
        (un) => [unclear]
        (un: is it this?) => [unclear: is it this?]

Customize as needed!

Update April 9, 2020: changed to implement specification recommendations from Atlas TI tech support
        see https://www.notion.so/thorgalle/Importing-plain-text-transcripts-into-Atlas-TI-3ee3c89ce0f94c2c9199e2e1711e57b4
        for a discussion

'''

import sys, re, traceback

# Typographic (curly) characters that replace straight ASCII quotes.
TYPOGRAPHIC_SINGLE_QUOTE = '’'
# Opening and closing double quotes need distinct names; assigning both to
# ..._BEGIN would leave only the closing quote behind.
TYPOGRAPHIC_DOUBLE_QUOTE_BEGIN = '“'
TYPOGRAPHIC_DOUBLE_QUOTE_END = '”'
# Full interviewer name written to the output, and the shorthand used for it in the transcript.
INTERVIEWER_NAME = 'Thor'
INTERVIEWER_SHORT = 'T'
# Speaker label inserted on lines that do not carry the interviewer's name.
PARTICIPANT_NAME = 'Participant'
# Output path used when none is given on the command line.
OUTPUT_DEFAULT = 'output.txt'
# An oTranscribe timestamp: one or two digits, a colon, two digits (captured as group 1).
OTRANS_TIMESTAMP_REGEX = re.compile(r'(\d{1,2}:\d{2})')

# get the input & output files
def getFileNames():
        '''
        Read the input and output paths from the command line.

        Returns:
                A tuple (inputFile, outputFile). inputFile is sys.argv[1], or None
                when no argument was given. outputFile is sys.argv[2] when present
                and OUTPUT_DEFAULT otherwise.
        '''
        # drop the script name so the positions line up with the USAGE string
        args = sys.argv[1:]
        inputFile = args[0] if args else None
        outputFile = args[1] if len(args) > 1 else OUTPUT_DEFAULT
        return (inputFile, outputFile)


# rules for replacing text parts
def getReplaceTuples():
        '''
        Build the ordered (compiled regex, replacement) rules applied to every transcript line.

        Order matters: the timestamp rule has to run before the participant rule,
        which looks for the bracketed [00:mm:ss] form it produces, and the
        interviewer rule has to run before the participant rule, which skips
        lines that already carry the interviewer's name.

        Returns:
                A list of 2-tuples (pattern, replacement), each meant to be used as
                pattern.sub(replacement, line).
        '''
        # the names are user-configurable, so escape them before putting them in a pattern
        interviewerShort = re.escape(INTERVIEWER_SHORT)
        interviewerName = re.escape(INTERVIEWER_NAME)

        singleQuoteR = (re.compile(r"'"), TYPOGRAPHIC_SINGLE_QUOTE)
        # TODO: this should convert to typographic double quotes, but that requires more complex logic
        # to detect the start/end of a double quotation and to deal with edge cases
        doubleQuoteR = (re.compile(r'"'), TYPOGRAPHIC_SINGLE_QUOTE)
        timeStampsR = (OTRANS_TIMESTAMP_REGEX, r'[00:\1]')
        ampersandR = (re.compile(r"\s?&\s?"), r' and ')
        unclearR1 = (re.compile(r'\(un\)'), r'[unclear]')
        # \s* swallows the blank after "un:" so the output has exactly one space after "unclear:"
        unclearR2 = (re.compile(r'\(un:\s*([^\)]*)\)'), r'[unclear: \1]')
        # (?<!\w) keeps the shorthand from matching the tail of an ordinary word ("Wait:", "that:")
        interviewerR = (re.compile(r'\s*(?<!\w)%s:' % interviewerShort, re.I), r'%s:' % INTERVIEWER_NAME)
        colR = (re.compile(r'\(collision\)'), r'[collision]')
        # anchored to the start of the line so a timestamp in the middle of a sentence gets no speaker label;
        # the lookahead includes the colon so a participant line starting with e.g. "Thorough" is still labelled
        participantR = (
                re.compile(r'^(\s*\[00:\d{1,2}:\d{2}\])\s*(?!%s:|\s)' % interviewerName),
                r'\g<1>%s: ' % PARTICIPANT_NAME,
        )
        regexes = [singleQuoteR, ampersandR, doubleQuoteR, timeStampsR, unclearR1, unclearR2, interviewerR, colR, participantR]
        return regexes

def isValidLine(line):
        '''
        Tell whether line should be kept in the converted transcript.

        A line is kept only when it is non-empty and OTRANS_TIMESTAMP_REGEX
        matches at its very first character; everything else is rejected.
        '''
        hasText = len(line) != 0
        # Pattern.match only matches at the start of the string, so a timestamp later in the line does not count
        return hasText and OTRANS_TIMESTAMP_REGEX.match(line) is not None

def replaceLine(line):
        '''
        Run line through every rule returned by getReplaceTuples(), in order.

        Each rule sees the output of the previous one. Returns the converted line.
        '''
        converted = line
        for pattern, replacement in getReplaceTuples():
                converted = pattern.sub(replacement, converted)
        return converted

def main():
        '''
        Convert the transcript named on the command line and write the result.

        Reads the input file, strips each line, drops the lines isValidLine()
        rejects, converts the rest with replaceLine() and writes them to the
        output file, one per line.

        Raises:
                SystemExit: when no input file was given on the command line.
        '''
        (inputFile, outputFile) = getFileNames()
        if inputFile is None:
                # without this guard open(None) fails with a TypeError that says nothing about usage
                sys.exit('USAGE: python convert-otrans.py <input_file> [output_file]')
        # the conversion emits non-ASCII typographic quotes, so do not depend on the platform's default encoding
        with open(inputFile, encoding='utf-8') as f:
                stripped = [line.strip() for line in f]
        content = [replaceLine(l) for l in stripped if isValidLine(l)]
        with open(outputFile, 'w', encoding='utf-8') as fo:
                fo.writelines("%s\n" % l for l in content)


# only convert when run as a script, not when the module is imported
if __name__ == '__main__':
        main()
	'''
	author: Thor Galle <@th0rgall>
	original version: April 3, 2020
	last updated: April 9, 2020

	A script to convert a Markdown transcript exported from https://otranscribe.com/ to something importable in Atlas TI v8.

	USAGE: python convert-otrans.py <input_file> [output_file]

	[output_file] defaults to output.txt

	Note: oTranscribe text is assumed to start with a timestamp on each line

	EXAMPLE
	=======

	Sample text before (input)
	--------------
	02:03 I think it's just the way it should be when it comes to.. online streaming services

	02:06 T: Yes, so you've seen this before?

	02:08 Uhuh. I don't know what the other buttons do though. It's like (un) .. oh ok, now I see

	Sample text after (output)
	------------
	[00:02:03]Participant: I think it’s just the way it should be when it comes to.. online streaming services
	[00:02:06]Thor: Yes, so you’ve seen this before?
	[00:02:08]Participant: Uhuh. I don’t know what the other buttons do though. It’s like [unclear] .. oh ok, now I see

	HOW IT WORKS
	=============
	This script works by:
	1. Cleaning and transforming the oTranscribe output so that it works for Atlas TI
	2. Replacing some shorthand formulations I used with their full equivalents via regular expressions
	The rules:
	T: => Thor: (replace with your own name using the variables in the code below)
	for other lines, it inserts Participant:
	(un) => [unclear]
	(un: is it this?) => [unclear: is it this?]

	Customize as needed!

	Update April 9, 2020: changed to implement specification recommendations from Atlas TI tech support
	see https://www.notion.so/thorgalle/Importing-plain-text-transcripts-into-Atlas-TI-3ee3c89ce0f94c2c9199e2e1711e57b4
	for a discussion

	'''

	import sys, re, traceback

	# Typographic (curly) characters that replace straight ASCII quotes.
	TYPOGRAPHIC_SINGLE_QUOTE = '’'
	# Opening and closing double quotes need distinct names; assigning both to
	# ..._BEGIN would leave only the closing quote behind.
	TYPOGRAPHIC_DOUBLE_QUOTE_BEGIN = '“'
	TYPOGRAPHIC_DOUBLE_QUOTE_END = '”'
	# Full interviewer name written to the output, and the shorthand used for it in the transcript.
	INTERVIEWER_NAME = 'Thor'
	INTERVIEWER_SHORT = 'T'
	# Speaker label inserted on lines that do not carry the interviewer's name.
	PARTICIPANT_NAME = 'Participant'
	# Output path used when none is given on the command line.
	OUTPUT_DEFAULT = 'output.txt'
	# An oTranscribe timestamp: one or two digits, a colon, two digits (captured as group 1).
	OTRANS_TIMESTAMP_REGEX = re.compile(r'(\d{1,2}:\d{2})')

	# get the input & output files
	def getFileNames():
		'''
		Read the input and output paths from the command line.

		Returns:
			A tuple (inputFile, outputFile). inputFile is sys.argv[1], or None
			when no argument was given. outputFile is sys.argv[2] when present
			and OUTPUT_DEFAULT otherwise.
		'''
		inputFile = None
		outputFile = OUTPUT_DEFAULT
		if len(sys.argv) > 1:
			inputFile = sys.argv[1]
			# the output path is optional and only looked at when an input path exists
			if len(sys.argv) > 2:
				outputFile = sys.argv[2]
		return (inputFile, outputFile)


	# rules for replacing text parts
	def getReplaceTuples():
		'''
		Build the ordered (compiled regex, replacement) rules applied to every transcript line.

		Order matters: the timestamp rule has to run before the participant rule,
		which looks for the bracketed [00:mm:ss] form it produces, and the
		interviewer rule has to run before the participant rule, which skips
		lines that already carry the interviewer's name.

		Returns:
			A list of 2-tuples (pattern, replacement), each meant to be used as
			pattern.sub(replacement, line).
		'''
		# the names are user-configurable, so escape them before putting them in a pattern
		interviewerShort = re.escape(INTERVIEWER_SHORT)
		interviewerName = re.escape(INTERVIEWER_NAME)

		singleQuoteR = (re.compile(r"'"), TYPOGRAPHIC_SINGLE_QUOTE)
		# TODO: this should convert to typographic double quotes, but that requires more complex logic
		# to detect the start/end of a double quotation and to deal with edge cases
		doubleQuoteR = (re.compile(r'"'), TYPOGRAPHIC_SINGLE_QUOTE)
		timeStampsR = (OTRANS_TIMESTAMP_REGEX, r'[00:\1]')
		ampersandR = (re.compile(r"\s?&\s?"), r' and ')
		unclearR1 = (re.compile(r'\(un\)'), r'[unclear]')
		# \s* swallows the blank after "un:" so the output has exactly one space after "unclear:"
		unclearR2 = (re.compile(r'\(un:\s*([^\)]*)\)'), r'[unclear: \1]')
		# (?<!\w) keeps the shorthand from matching the tail of an ordinary word ("Wait:", "that:")
		interviewerR = (re.compile(r'\s*(?<!\w)%s:' % interviewerShort, re.I), r'%s:' % INTERVIEWER_NAME)
		colR = (re.compile(r'\(collision\)'), r'[collision]')
		# the alternation bar must be a bare "|": escaped, it only matches a literal pipe character.
		# anchored to the start of the line so a timestamp in the middle of a sentence gets no speaker label
		participantR = (
			re.compile(r'^(\s*\[00:\d{1,2}:\d{2}\])\s*(?!%s:|\s)' % interviewerName),
			r'\g<1>%s: ' % PARTICIPANT_NAME,
		)
		regexes = [singleQuoteR, ampersandR, doubleQuoteR, timeStampsR, unclearR1, unclearR2, interviewerR, colR, participantR]
		return regexes

	def isValidLine(line):
		'''
		Tell whether line should be kept in the converted transcript.

		A line is kept only when it is non-empty and OTRANS_TIMESTAMP_REGEX
		matches at its very first character; everything else is rejected.
		'''
		# empty line -> not valid
		if len(line) == 0:
			return False
		# no timestamp at the start of the line -> not valid
		elif not OTRANS_TIMESTAMP_REGEX.match(line):
			return False
		return True

	def replaceLine(line):
		'''
		Run line through every rule returned by getReplaceTuples(), in order.

		Each rule sees the output of the previous one. Returns the converted line.
		'''
		outLine = line
		for pattern, replacement in getReplaceTuples():
			outLine = pattern.sub(replacement, outLine)
		return outLine

	def main():
		'''
		Convert the transcript named on the command line and write the result.

		Reads the input file, strips each line, drops the lines isValidLine()
		rejects, converts the rest with replaceLine() and writes them to the
		output file, one per line.

		Raises:
			SystemExit: when no input file was given on the command line.
		'''
		(inputFile, outputFile) = getFileNames()
		if inputFile is None:
			# without this guard open(None) fails with a TypeError that says nothing about usage
			sys.exit('USAGE: python convert-otrans.py <input_file> [output_file]')
		# the conversion emits non-ASCII typographic quotes, so do not depend on the platform's default encoding
		with open(inputFile, encoding='utf-8') as f:
			stripped = [line.strip() for line in f]
		content = [replaceLine(l) for l in stripped if isValidLine(l)]
		with open(outputFile, 'w', encoding='utf-8') as fo:
			fo.writelines("%s\n" % l for l in content)


	# only convert when run as a script, not when the module is imported
	if __name__ == '__main__':
		main()