# blarghmatey/convert_transcript.py

## convert_transcript.py
from convert import (read_transcript, parse_srt_transcript, parse_text_transcript,
                     render_srt_transcript_to_vtt, render_text_transcript_to_html)

import click
from click import Path


@click.command()
@click.option('--filename', '-f', type=Path(exists=True),
              help='Source transcript to be converted')
@click.option('--output', '-o', type=Path(exists=False),
              help='Destination for writing the converted transcript')
def convert_transcript(filename, output=None):
    """Convert a transcript file into its web-friendly counterpart.

    SRT sources (.srt) are rendered as WebVTT (.vtt) and plain-text sources
    (.txt) are rendered as an HTML fragment (.html). Unless --output is
    given, the result is written next to the source file with only the
    extension changed.
    """
    type_map = {'srt': 'vtt', 'txt': 'html'}
    func_map = {'in': {'srt': parse_srt_transcript,
                       'txt': parse_text_transcript},
                'out': {'vtt': render_srt_transcript_to_vtt,
                        'html': render_text_transcript_to_html}}

    # --filename is not a required option, so it may arrive as None.
    if not filename:
        raise click.UsageError(
            'No source transcript given; pass one with --filename/-f.')

    # Split on the LAST dot only, so that directory or file names which
    # happen to contain the extension text are left untouched.
    stem, dot, extension = filename.rpartition('.')
    ftype = extension.lower()
    if not dot or ftype not in type_map:
        supported = ', '.join(f'.{ext}' for ext in type_map)
        raise click.UsageError(
            f'Unsupported transcript type for {filename!r}; '
            f'expected one of: {supported}')

    out_type = type_map[ftype]
    # Honour an explicit destination; otherwise swap just the extension.
    destination = output or f'{stem}.{out_type}'

    transcript = read_transcript(filename)
    parsed = func_map['in'][ftype](transcript)
    # WebVTT files must be UTF-8, so do not rely on the locale default.
    with open(destination, 'w', encoding='utf-8') as converted:
        converted.write(func_map['out'][out_type](parsed))


# Invoke the click command only when this file is executed as a script.
if __name__ == '__main__':
    convert_transcript()

## srt_to_vtt_lib.py
from parsy import decimal_digit, whitespace, seq, test_char, peek, generate, regex
from parsy import string as pstring
from yattag import Doc, indent
from urllib.parse import quote
import re
import textwrap


def create_text_parser():
    """Build the parsy parsers used for plain-text transcripts.

    Returns:
        A ``(section_start, paragraph)`` tuple.  ``section_start`` yields a
        dict with ``speaker`` (a list of words) and ``timestamp`` (a list of
        ints); ``paragraph`` yields the text up to the next newline.
    """
    def is_name_char(char):
        return char.isalpha() and not re.match(r'\s+', char)

    def is_paragraph_char(char):
        return char != '\n'

    # A speaker name is a run of alphabetic words, each followed by whitespace.
    name_word = test_char(is_name_char, 'A non space character').many().concat()
    speaker = name_word.skip(whitespace).many().desc('Speaker Name')

    # Each timestamp field is one or two digits, optionally followed by ':'.
    field = decimal_digit.times(1, 2).concat().map(int)
    colon = pstring(':').optional()
    timestamp = field.skip(colon).many().skip(whitespace).desc('Timestamp')

    paragraph = test_char(is_paragraph_char, 'Paragraph text').many().concat()
    return seq(speaker=speaker, timestamp=timestamp), paragraph


def create_srt_parser():
    """Build the parsy parsers for the three parts of an SRT cue.

    Returns:
        A ``(cue_header, time_range, speech)`` tuple:

        * ``cue_header`` yields the cue's digits as a string and consumes
          the whitespace that follows them.
        * ``time_range`` yields a dict with ``start``, ``sep`` and ``end``
          and consumes the newline that ends the timing line.
        * ``speech`` yields a dict with ``name`` (an optional speaker
          prefix, ``None`` when absent) and ``par`` (the rest of the line).
    """
    cue_header = decimal_digit.at_least(1).skip(whitespace).concat()

    timestamp = regex(r'\d{2}:\d{2}:\d{2},\d{3}').desc('Timestamp')
    time_range = seq(start=timestamp, sep=pstring(' --> '),
                     end=timestamp).skip(pstring('\n'))

    # Anything up to the first ": " on the line is treated as the speaker.
    name = regex(r'(.*?): ').optional().desc('Speaker Name')
    paragraph = test_char(lambda c: c != '\n',
                          'Paragraph text').many().concat()
    speech = seq(name=name, par=paragraph)
    return cue_header, time_range, speech


def read_transcript(fpath):
    """Return the full text of the transcript stored at *fpath*.

    The file is decoded as UTF-8 regardless of the platform locale.  A
    leading byte-order mark, which some subtitle editors write, is dropped
    so that it cannot end up in front of the first cue number.

    Args:
        fpath: Path of the transcript file to read.

    Returns:
        The decoded file contents as a single string.
    """
    # 'utf-8-sig' reads plain UTF-8 as well as UTF-8 with a BOM.
    with open(fpath, 'r', encoding='utf-8-sig') as source:
        return source.read()


@generate
def text_section():
    """Parse one speaker section: a header followed by its paragraphs.

    Yields parsers to parsy and finally returns ``(header, pars)`` where
    ``header`` is the result of the section-start parser and ``pars`` is the
    list of non-empty paragraph strings belonging to that section.
    """
    section_start, paragraph = create_text_parser()
    # Optional whitespace, normalised so the result is always a string.
    wsfunc = whitespace.optional().map(lambda c: c if c is not None else '')
    # Look ahead for the next section header without consuming any input.
    next_header = peek(section_start.optional())

    header = yield section_start
    pars = []
    while True:
        par = yield paragraph
        # At end of input the paragraph parser succeeds with an empty
        # string; keeping it would add a blank paragraph to the section.
        if par:
            pars.append(par)
        ws = yield wsfunc
        head_test = yield next_header
        # Stop when the next section starts, or when no whitespace could be
        # consumed (nothing separates this paragraph from what follows).
        if head_test or not ws:
            break
    return header, pars


@generate
def srt_section():
    """Parse one SRT cue: its number, its timing line and its text line.

    Yields parsers to parsy and finally returns
    ``(header, timecue, transcript)`` as produced by the three parsers from
    ``create_srt_parser``.
    """
    cue_header, time_range, speech = create_srt_parser()
    header = yield cue_header
    timecue = yield time_range
    transcript = yield speech
    # Cues are separated by whitespace, but the last cue may end exactly at
    # end of input (no trailing newline), so the separator is optional.
    yield whitespace.optional()
    return header, timecue, transcript


def parse_text_transcript(transcript):
    """Parse a plain-text transcript into a list of ``text_section`` results."""
    all_sections = text_section.many()
    return all_sections.parse(transcript)


def parse_srt_transcript(transcript):
    """Parse an SRT transcript into a list of ``srt_section`` results."""
    all_cues = srt_section.many()
    return all_cues.parse(transcript)


def render_text_transcript_to_html(parsed_transcript):
    """Render parsed text sections as a collapsible HTML fragment.

    Args:
        parsed_transcript: Iterable of ``(head, pars)`` pairs, where ``head``
            has ``speaker`` (list of words) and ``timestamp`` (list of ints)
            entries and ``pars`` is a list of paragraph strings.

    Returns:
        The indented HTML: a ``<details>`` element holding one ``<h4>``
        (speaker plus timestamp link) and the ``<p>`` paragraphs per section.
    """
    doc, tag, text, line = Doc().ttl()
    with tag('details'):
        with tag('summary'):
            text('Click here to read the unedited transcript...')
        for header, paragraphs in parsed_transcript:
            speaker = ' '.join(header['speaker'])
            # Zero-pad every field, e.g. [1, 2, 3] -> "01:02:03".
            timestring = ':'.join(f'{part:02d}' for part in header['timestamp'])
            link = f'?t={quote(timestring)}'
            with tag('h4'):
                text(speaker + '  ')
                line('a', timestring, href=link)
            for paragraph in paragraphs:
                line('p', paragraph)
    return indent(doc.getvalue())


def render_srt_transcript_to_vtt(parsed_transcript):
    """Render parsed SRT cues as a WebVTT document.

    Args:
        parsed_transcript: Iterable of ``(cue_header, time_range, speech)``
            tuples.  ``time_range`` has ``start``, ``sep`` and ``end``
            entries; ``speech`` has ``name`` (speaker prefix or a falsy
            value) and ``par`` (the cue text).

    Returns:
        The WebVTT text: the ``WEBVTT`` header followed by one block per cue.
        A cue without its own speaker inherits the most recent one; cues
        that come before any named speaker are written without a voice tag.
    """
    blocks = ['WEBVTT\n\n']
    last_speaker = None
    for cue_header, time_range, speech in parsed_transcript:
        # Fall back to the previous speaker when the cue names none.
        speaker = speech['name'] or last_speaker
        last_speaker = speaker
        # SRT separates milliseconds with a comma, WebVTT with a period.
        start = time_range['start'].replace(',', '.')
        end = time_range['end'].replace(',', '.')
        # No speaker seen yet: emit the bare text instead of failing.
        voice = f"<v {speaker.strip(': ')}>" if speaker else ''
        blocks.append(
            f"{cue_header}\n"
            f"{start}{time_range['sep']}{end}\n"
            f"{voice}{speech['par']}\n\n"
        )
    return ''.join(blocks)
	from convert import (read_transcript, parse_srt_transcript, parse_text_transcript,
	render_srt_transcript_to_vtt, render_text_transcript_to_html)

	import click
	from click import Path


	@click.command()
	@click.option('--filename', '-f', type=Path(exists=True),
	              help='Source transcript to be converted')
	@click.option('--output', '-o', type=Path(exists=False),
	              help='Destination for writing the converted transcript')
	def convert_transcript(filename, output=None):
	    """Convert a transcript file into its web-friendly counterpart.

	    SRT sources (.srt) are rendered as WebVTT (.vtt) and plain-text sources
	    (.txt) are rendered as an HTML fragment (.html). Unless --output is
	    given, the result is written next to the source file with only the
	    extension changed.
	    """
	    type_map = {'srt': 'vtt', 'txt': 'html'}
	    func_map = {'in': {'srt': parse_srt_transcript,
	                       'txt': parse_text_transcript},
	                'out': {'vtt': render_srt_transcript_to_vtt,
	                        'html': render_text_transcript_to_html}}

	    # --filename is not a required option, so it may arrive as None.
	    if not filename:
	        raise click.UsageError(
	            'No source transcript given; pass one with --filename/-f.')

	    # Split on the LAST dot only, so that directory or file names which
	    # happen to contain the extension text are left untouched.
	    stem, dot, extension = filename.rpartition('.')
	    ftype = extension.lower()
	    if not dot or ftype not in type_map:
	        supported = ', '.join(f'.{ext}' for ext in type_map)
	        raise click.UsageError(
	            f'Unsupported transcript type for {filename!r}; '
	            f'expected one of: {supported}')

	    out_type = type_map[ftype]
	    # Honour an explicit destination; otherwise swap just the extension.
	    destination = output or f'{stem}.{out_type}'

	    transcript = read_transcript(filename)
	    parsed = func_map['in'][ftype](transcript)
	    # WebVTT files must be UTF-8, so do not rely on the locale default.
	    with open(destination, 'w', encoding='utf-8') as converted:
	        converted.write(func_map['out'][out_type](parsed))


	# Invoke the click command only when this file is executed as a script.
	if __name__ == '__main__':
	    convert_transcript()
	from parsy import decimal_digit, whitespace, seq, test_char, peek, generate, regex
	from parsy import string as pstring
	from yattag import Doc, indent
	from urllib.parse import quote
	import re
	import textwrap


	def create_text_parser():
	    """Build the parsy parsers used for plain-text transcripts.

	    Returns:
	        A ``(section_start, paragraph)`` tuple.  ``section_start`` yields a
	        dict with ``speaker`` (a list of words) and ``timestamp`` (a list of
	        ints); ``paragraph`` yields the text up to the next newline.
	    """
	    def is_name_char(char):
	        return char.isalpha() and not re.match(r'\s+', char)

	    def is_paragraph_char(char):
	        return char != '\n'

	    # A speaker name is a run of alphabetic words, each followed by whitespace.
	    name_word = test_char(is_name_char, 'A non space character').many().concat()
	    speaker = name_word.skip(whitespace).many().desc('Speaker Name')

	    # Each timestamp field is one or two digits, optionally followed by ':'.
	    field = decimal_digit.times(1, 2).concat().map(int)
	    colon = pstring(':').optional()
	    timestamp = field.skip(colon).many().skip(whitespace).desc('Timestamp')

	    paragraph = test_char(is_paragraph_char, 'Paragraph text').many().concat()
	    return seq(speaker=speaker, timestamp=timestamp), paragraph


	def create_srt_parser():
	    """Build the parsy parsers for the three parts of an SRT cue.

	    Returns:
	        A ``(cue_header, time_range, speech)`` tuple:

	        * ``cue_header`` yields the cue's digits as a string and consumes
	          the whitespace that follows them.
	        * ``time_range`` yields a dict with ``start``, ``sep`` and ``end``
	          and consumes the newline that ends the timing line.
	        * ``speech`` yields a dict with ``name`` (an optional speaker
	          prefix, ``None`` when absent) and ``par`` (the rest of the line).
	    """
	    cue_header = decimal_digit.at_least(1).skip(whitespace).concat()

	    timestamp = regex(r'\d{2}:\d{2}:\d{2},\d{3}').desc('Timestamp')
	    time_range = seq(start=timestamp, sep=pstring(' --> '),
	                     end=timestamp).skip(pstring('\n'))

	    # Anything up to the first ": " on the line is treated as the speaker.
	    name = regex(r'(.*?): ').optional().desc('Speaker Name')
	    paragraph = test_char(lambda c: c != '\n',
	                          'Paragraph text').many().concat()
	    speech = seq(name=name, par=paragraph)
	    return cue_header, time_range, speech


	def read_transcript(fpath):
	    """Return the full text of the transcript stored at *fpath*.

	    The file is decoded as UTF-8 regardless of the platform locale.  A
	    leading byte-order mark, which some subtitle editors write, is dropped
	    so that it cannot end up in front of the first cue number.

	    Args:
	        fpath: Path of the transcript file to read.

	    Returns:
	        The decoded file contents as a single string.
	    """
	    # 'utf-8-sig' reads plain UTF-8 as well as UTF-8 with a BOM.
	    with open(fpath, 'r', encoding='utf-8-sig') as source:
	        return source.read()


	@generate
	def text_section():
	    """Parse one speaker section: a header followed by its paragraphs.

	    Yields parsers to parsy and finally returns ``(header, pars)`` where
	    ``header`` is the result of the section-start parser and ``pars`` is the
	    list of non-empty paragraph strings belonging to that section.
	    """
	    section_start, paragraph = create_text_parser()
	    # Optional whitespace, normalised so the result is always a string.
	    wsfunc = whitespace.optional().map(lambda c: c if c is not None else '')
	    # Look ahead for the next section header without consuming any input.
	    next_header = peek(section_start.optional())

	    header = yield section_start
	    pars = []
	    while True:
	        par = yield paragraph
	        # At end of input the paragraph parser succeeds with an empty
	        # string; keeping it would add a blank paragraph to the section.
	        if par:
	            pars.append(par)
	        ws = yield wsfunc
	        head_test = yield next_header
	        # Stop when the next section starts, or when no whitespace could be
	        # consumed (nothing separates this paragraph from what follows).
	        if head_test or not ws:
	            break
	    return header, pars


	@generate
	def srt_section():
	    """Parse one SRT cue: its number, its timing line and its text line.

	    Yields parsers to parsy and finally returns
	    ``(header, timecue, transcript)`` as produced by the three parsers from
	    ``create_srt_parser``.
	    """
	    cue_header, time_range, speech = create_srt_parser()
	    header = yield cue_header
	    timecue = yield time_range
	    transcript = yield speech
	    # Cues are separated by whitespace, but the last cue may end exactly at
	    # end of input (no trailing newline), so the separator is optional.
	    yield whitespace.optional()
	    return header, timecue, transcript


	def parse_text_transcript(transcript):
	    """Parse a plain-text transcript into a list of ``text_section`` results."""
	    all_sections = text_section.many()
	    return all_sections.parse(transcript)


	def parse_srt_transcript(transcript):
	    """Parse an SRT transcript into a list of ``srt_section`` results."""
	    all_cues = srt_section.many()
	    return all_cues.parse(transcript)


	def render_text_transcript_to_html(parsed_transcript):
	    """Render parsed text sections as a collapsible HTML fragment.

	    Args:
	        parsed_transcript: Iterable of ``(head, pars)`` pairs, where ``head``
	            has ``speaker`` (list of words) and ``timestamp`` (list of ints)
	            entries and ``pars`` is a list of paragraph strings.

	    Returns:
	        The indented HTML: a ``<details>`` element holding one ``<h4>``
	        (speaker plus timestamp link) and the ``<p>`` paragraphs per section.
	    """
	    doc, tag, text, line = Doc().ttl()
	    with tag('details'):
	        with tag('summary'):
	            text('Click here to read the unedited transcript...')
	        for header, paragraphs in parsed_transcript:
	            speaker = ' '.join(header['speaker'])
	            # Zero-pad every field, e.g. [1, 2, 3] -> "01:02:03".
	            timestring = ':'.join(f'{part:02d}' for part in header['timestamp'])
	            link = f'?t={quote(timestring)}'
	            with tag('h4'):
	                text(speaker + ' ')
	                line('a', timestring, href=link)
	            for paragraph in paragraphs:
	                line('p', paragraph)
	    return indent(doc.getvalue())


	def render_srt_transcript_to_vtt(parsed_transcript):
	    """Render parsed SRT cues as a WebVTT document.

	    Args:
	        parsed_transcript: Iterable of ``(cue_header, time_range, speech)``
	            tuples.  ``time_range`` has ``start``, ``sep`` and ``end``
	            entries; ``speech`` has ``name`` (speaker prefix or a falsy
	            value) and ``par`` (the cue text).

	    Returns:
	        The WebVTT text: the ``WEBVTT`` header followed by one block per cue.
	        A cue without its own speaker inherits the most recent one; cues
	        that come before any named speaker are written without a voice tag.
	    """
	    blocks = ['WEBVTT\n\n']
	    last_speaker = None
	    for cue_header, time_range, speech in parsed_transcript:
	        # Fall back to the previous speaker when the cue names none.
	        speaker = speech['name'] or last_speaker
	        last_speaker = speaker
	        # SRT separates milliseconds with a comma, WebVTT with a period.
	        start = time_range['start'].replace(',', '.')
	        end = time_range['end'].replace(',', '.')
	        # No speaker seen yet: emit the bare text instead of failing.
	        voice = f"<v {speaker.strip(': ')}>" if speaker else ''
	        blocks.append(
	            f"{cue_header}\n"
	            f"{start}{time_range['sep']}{end}\n"
	            f"{voice}{speech['par']}\n\n"
	        )
	    return ''.join(blocks)