Skip to content

Instantly share code, notes, and snippets.

@goldengrape
Last active March 22, 2019 03:25
Show Gist options
  • Save goldengrape/4de4ddb51a481471deb2c1b578204646 to your computer and use it in GitHub Desktop.
Save goldengrape/4de4ddb51a481471deb2c1b578204646 to your computer and use it in GitHub Desktop.
import webvtt
import pandas as pd
# Path of the WebVTT subtitle file to convert into a plain transcript.
filename = "en.vtt"
# Minimum cue duration: only cues lasting strictly longer than 0.1 s are kept.
# NOTE(review): presumably this drops the near-zero-length cues that
# auto-generated captions use to repeat the previous line -- confirm
# against the input files.
time_epsilon = pd.Timedelta('00:00:00.1')

vtt = webvtt.read(filename)

# One row per cue: parsed start time, parsed end time, and the LAST text
# line of the cue (earlier lines of a multi-line cue are discarded).
rows = []
for v in vtt:
    text_lines = v.text.splitlines()
    if not text_lines:
        # "".splitlines() is [], so indexing [-1] would raise IndexError;
        # a cue without any text contributes nothing to the transcript.
        continue
    rows.append([pd.to_datetime(v.start), pd.to_datetime(v.end), text_lines[-1]])

df = pd.DataFrame(rows, columns=["start", "end", "text"])

# Boolean-mask selection keeps exactly the rows whose duration exceeds the
# threshold (same rows as the former where(...).dropna() round trip).
df = df[df.end - df.start > time_epsilon]

print("\n".join(df.text))  # the whole transcript, one caption line per row
import webvtt
# inspired by Terence Eden https://shkspr.mobi/blog/2018/09/convert-webvtt-to-a-transcript-using-python/
# Pass 1: flatten the cues in `vtt` into three parallel lists with one entry
# per text line; every line carries the start/end time of its cue.
lines = []
starts = []
ends = []
for cue in vtt:
    cue_lines = cue.text.strip().splitlines()
    for text in cue_lines:
        lines.append(text)
        starts.append(cue.start)
        ends.append(cue.end)

# Pass 2: collapse runs of identical consecutive lines, keeping the first
# occurrence of each run together with its start/end time.
new_lines = []
new_starts = []
new_ends = []
previous = None
for text, start, end in zip(lines, starts, ends):
    if text != previous:
        new_lines.append(text)
        new_starts.append(start)
        new_ends.append(end)
        previous = text

print("\n".join(new_lines))  # the whole transcript, one line per entry
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment