Skip to content

Instantly share code, notes, and snippets.

@blarghmatey
Created June 17, 2021 15:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save blarghmatey/3b75adcdb101e47d10b95ff960022051 to your computer and use it in GitHub Desktop.
Save blarghmatey/3b75adcdb101e47d10b95ff960022051 to your computer and use it in GitHub Desktop.
from convert import (read_transcript, parse_srt_transcript, parse_text_transcript,
render_srt_transcript_to_vtt, render_text_transcript_to_html)
import click
from click import Path
@click.command()
@click.option('--filename', '-f', type=Path(exists=True), required=True,
              help='Source transcript to be converted')
@click.option('--output', '-o', type=Path(exists=False),
              help='Destination for writing the converted transcript')
def convert_transcript(filename, output=None):
    """Convert an SRT transcript to WebVTT or a text transcript to HTML.

    The conversion is selected by the extension of FILENAME (.srt or .txt).
    When OUTPUT is omitted the result is written beside the source file with
    the extension swapped to .vtt or .html.
    """
    type_map = {'srt': 'vtt', 'txt': 'html'}
    func_map = {'in': {'srt': parse_srt_transcript,
                       'txt': parse_text_transcript},
                'out': {'vtt': render_srt_transcript_to_vtt,
                        'html': render_text_transcript_to_html}}

    # Split on the last dot only, so that directory or file names which
    # happen to contain "srt"/"txt" are never rewritten.
    stem, dot, extension = filename.rpartition('.')
    ftype = extension.lower()
    if not dot or ftype not in type_map:
        supported = ', '.join(f'.{ext}' for ext in type_map)
        raise click.BadParameter(
            f'cannot tell how to convert {filename!r}; '
            f'supported extensions are {supported}',
            param_hint='--filename')
    out_type = type_map[ftype]

    # Honour --output when given; otherwise swap the source extension.
    if output is None:
        output = f'{stem}.{out_type}'

    parsed = func_map['in'][ftype](read_transcript(filename))
    # Render before opening the destination so that a parse/render failure
    # does not leave a truncated output file behind.
    rendered = func_map['out'][out_type](parsed)
    with open(output, 'w', encoding='utf-8') as converted:
        converted.write(rendered)
if __name__ == "__main__":
    # Run the click command only when executed as a script, not on import.
    convert_transcript()
from parsy import decimal_digit, whitespace, seq, test_char, peek, generate, regex
from parsy import string as pstring
from yattag import Doc, indent
from urllib.parse import quote
import re
import textwrap
def create_text_parser():
    """Build the parsy parsers used for plain-text transcripts.

    Returns:
        A ``(section_start, paragraph)`` tuple of parsers.

        ``section_start`` matches a section header and produces a mapping
        with two entries: ``speaker`` (a list of alphabetic words, each of
        which must be followed by whitespace) and ``timestamp`` (a list of
        ints parsed from one- or two-digit groups, each optionally followed
        by ``:``; the whole timestamp must be followed by whitespace).

        ``paragraph`` matches every character up to, but not including, the
        next newline and produces it as a single string.
    """
    def is_word_char(c):
        return c.isalpha() and not re.match(r'\s+', c)

    # Speaker name: zero or more words, each terminated by whitespace.
    word = test_char(is_word_char, 'A non space character').many().concat()
    speaker_name = word.skip(whitespace).many().desc('Speaker Name')

    # Timestamp: digit groups separated by optional colons.
    colon = pstring(':')
    digit_group = decimal_digit.times(1, 2).concat().map(int)
    stamp = digit_group.skip(colon.optional()).many()
    stamp = stamp.skip(whitespace).desc('Timestamp')

    header = seq(speaker=speaker_name, timestamp=stamp)
    body_line = test_char(lambda c: c != '\n',
                          'Paragraph text').many().concat()
    return header, body_line
def create_srt_parser():
    """Build the parsy parsers used for SRT transcripts.

    Returns:
        A ``(cue_header, time_range, speech)`` tuple of parsers.

        ``cue_header`` matches one or more digits followed by whitespace and
        produces the digits as a string.

        ``time_range`` matches ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` followed by
        a newline and produces a mapping with ``start``, ``sep`` and ``end``
        entries (all strings, exactly as they appear in the input).

        ``speech`` produces a mapping with ``name`` (the optional
        ``"Speaker: "`` prefix including the trailing colon and space, or
        ``None`` when absent) and ``par`` (the rest of the line).
    """
    cue_header = decimal_digit.at_least(1).skip(whitespace).concat()
    name = regex(r'(.*?): ').optional().desc('Speaker Name')
    timestamp = regex(r'\d{2}:\d{2}:\d{2},\d{3}').desc('Timestamp')
    time_range = seq(start=timestamp, sep=pstring(' --> '),
                     end=timestamp).skip(pstring('\n'))
    paragraph = test_char(lambda c: c != '\n',
                          'Paragraph text').many().concat()
    speech = seq(name=name, par=paragraph)
    return cue_header, time_range, speech
def read_transcript(fpath):
    """Return the full text content of the transcript file at *fpath*.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding. A leading byte-order mark, if present, is dropped so that it
    cannot end up in front of the first cue number or speaker name.

    Args:
        fpath: Path of the transcript file to read.

    Returns:
        The decoded file content as a single string.
    """
    # 'utf-8-sig' reads plain UTF-8 as well as UTF-8 with a BOM.
    with open(fpath, 'r', encoding='utf-8-sig') as t:
        return t.read()
@generate
def text_section():
    """Parse one section of a plain-text transcript.

    A section is a header (speaker and timestamp) followed by one or more
    paragraphs. After each paragraph any whitespace is consumed; the section
    ends when no whitespace followed the paragraph or when the upcoming
    input looks like the next section header (checked without consuming it).

    Produces a ``(header, paragraphs)`` tuple.
    """
    section_start, paragraph = create_text_parser()
    # Optional whitespace that always yields a string ('' when absent).
    optional_gap = whitespace.optional().map(
        lambda matched: '' if matched is None else matched)

    header = yield section_start
    paragraphs = []
    while True:
        paragraphs.append((yield paragraph))
        gap = yield optional_gap
        # Look ahead only: the next header is left for the next section.
        upcoming_header = yield peek(section_start.optional())
        if upcoming_header or not gap:
            break
    return header, paragraphs
@generate
def srt_section():
    """Parse one SRT cue: cue number, time range and one line of speech.

    Produces a ``(header, timecue, transcript)`` tuple as returned by the
    three parsers from ``create_srt_parser``.
    """
    cue_header, time_range, speech = create_srt_parser()
    header = yield cue_header
    timecue = yield time_range
    transcript = yield speech
    # The speech parser stops at a newline or at end of input, so the only
    # place whitespace can be missing here is the very end of the file.
    # Keeping it optional lets the final cue parse when the file does not
    # end with a newline.
    yield whitespace.optional()
    return header, timecue, transcript
def parse_text_transcript(transcript):
    """Parse a whole plain-text transcript into a list of sections.

    The entire input must be consumed by repeated ``text_section`` matches;
    otherwise the parser's ``parse`` call raises.
    """
    all_sections = text_section.many()
    return all_sections.parse(transcript)
def parse_srt_transcript(transcript):
    """Parse a whole SRT transcript into a list of cues.

    The entire input must be consumed by repeated ``srt_section`` matches;
    otherwise the parser's ``parse`` call raises.
    """
    all_cues = srt_section.many()
    return all_cues.parse(transcript)
def render_text_transcript_to_html(parsed_transcript):
    """Render a parsed text transcript as a collapsible HTML fragment.

    Args:
        parsed_transcript: Iterable of ``(head, pars)`` pairs, where ``head``
            has a ``speaker`` list of words and a ``timestamp`` list of ints,
            and ``pars`` is a list of paragraph strings.

    Returns:
        Indented HTML: a ``<details>`` element whose ``<summary>`` is followed
        by, for every section, an ``<h4>`` (speaker name plus a ``?t=`` link
        labelled with the zero-padded timestamp) and one ``<p>`` per
        paragraph.
    """
    doc, tag, text, line = Doc().ttl()
    with tag('details'):
        with tag('summary'):
            text('Click here to read the unedited transcript...')
        for header, paragraphs in parsed_transcript:
            speaker = ' '.join(header['speaker'])
            stamp = ':'.join(f'{part:02d}' for part in header['timestamp'])
            with tag('h4'):
                text(speaker + ' ')
                line('a', stamp, href=f'?t={quote(stamp)}')
            for paragraph in paragraphs:
                line('p', paragraph)
    markup = doc.getvalue()
    return indent(markup)
def render_srt_transcript_to_vtt(parsed_transcript):
    """Render parsed SRT cues as a WebVTT document.

    Args:
        parsed_transcript: Iterable of ``(cue_header, time_range, speech)``
            triples. ``time_range`` has ``start``, ``sep`` and ``end``
            strings; ``speech`` has ``name`` (a ``"Speaker: "`` prefix or
            ``None``) and ``par`` (the cue text).

    Returns:
        The WebVTT text: a ``WEBVTT`` header line followed by the cues, each
        preceded by a blank line. Timestamps use ``.`` instead of ``,`` as
        the millisecond separator. Cue text is wrapped in a ``<v Speaker>``
        voice tag; a cue without its own speaker inherits the most recent
        one, and cues before any speaker is known are emitted without a tag.
    """
    blocks = ['WEBVTT\n']
    last_speaker = None
    for cue_header, time_range, speech in parsed_transcript:
        # The parser keeps the trailing ": " on the name; drop it here.
        speaker = (speech['name'] or '').strip(': ')
        if speaker:
            last_speaker = speaker
        voice = f'<v {last_speaker}>' if last_speaker else ''
        start = time_range['start'].replace(',', '.')
        end = time_range['end'].replace(',', '.')
        blocks.append(
            f"{cue_header}\n"
            f"{start}{time_range['sep']}{end}\n"
            f"{voice}{speech['par']}\n"
        )
    # Joining with a newline puts the blank line WebVTT requires after the
    # header and between consecutive cues.
    return '\n'.join(blocks)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment