Last active
March 22, 2019 03:25
-
-
Save goldengrape/4de4ddb51a481471deb2c1b578204646 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import webvtt | |
import pandas as pd | |
# Convert a WebVTT caption file into a plain-text transcript, one line per cue.
filename = "en.vtt"
# Minimum cue duration; cues lasting this long or less are filtered out below.
time_epsilon = pd.Timedelta('00:00:00.1')

vtt = webvtt.read(filename)

# One row per cue: parsed start/end timestamps plus the LAST line of the cue's
# text. `splitlines()` returns [] for an empty cue, so fall back to "" rather
# than letting the [-1] lookup raise IndexError.
# NOTE(review): only the end - start difference is used below, so whatever
# date pd.to_datetime attaches to these time-only strings does not matter.
df = pd.DataFrame(
    [
        [
            pd.to_datetime(v.start),
            pd.to_datetime(v.end),
            (v.text.splitlines() or [""])[-1],
        ]
        for v in vtt
    ],
    columns=["start", "end", "text"],
)

# Keep only the cues that last strictly longer than time_epsilon. A boolean
# mask selects the rows directly instead of blanking them with where() and
# then removing the blanks with dropna().
df = df[df.end - df.start > time_epsilon]
print("\n".join(df.text))  # if you need all transcript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import webvtt | |
# inspired by Terence Eden https://shkspr.mobi/blog/2018/09/convert-webvtt-to-a-transcript-using-python/
# Build a transcript from a WebVTT file, dropping text lines that merely
# repeat the line kept immediately before them.

# This snippet imports webvtt but never created `vtt`; load the captions here
# so the script runs on its own.
filename = "en.vtt"
vtt = webvtt.read(filename)

# Flatten the cues into three parallel lists with one entry per text line.
# A cue with several lines contributes its start/end once for each line.
lines = []
starts = []
ends = []
for cue in vtt:
    cue_lines = cue.text.strip().splitlines()
    lines.extend(cue_lines)
    starts.extend([cue.start] * len(cue_lines))
    ends.extend([cue.end] * len(cue_lines))

# Walk the flattened lines and keep a line only when it differs from the last
# line that was kept; the timestamps of each kept line are preserved alongside.
previous = None
new_lines = []
new_starts = []
new_ends = []
for text, start, end in zip(lines, starts, ends):
    if text == previous:
        continue
    new_lines.append(text)
    new_starts.append(start)
    new_ends.append(end)
    previous = text

print("\n".join(new_lines))  # if you need all transcript
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment