Last active
January 4, 2017 13:52
-
-
Save RockyRoad29/e50000eb4df55d5bdf55c9ca0b5e9c43 to your computer and use it in GitHub Desktop.
Extract text from .vtt format subtitles (e.g. youtube) trying to group text chunks in phrases, while retaining their position.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f
# --------------------------------------------------------
# Extracts text from .vtt format subtitles (e.g. youtube)
# trying to group text chunks in phrases, and output
# each set along with its start time position.
#
# Of course you will likely need to polish it manually,
# but this would be a helpful start.
#
# Debugging traces are prefixed with "!" for easy filtering.
#
# @author Michelle Baert aka RockyRoad, 2017
# --------------------------------------------------------
BEGIN {
    # Tunables. A default is applied only when the variable was not already
    # supplied on the command line (e.g. `awk -v WRAPPED_AT=40 -v DEBUG=1 -f ...`);
    # -v assignments happen before BEGIN runs, so an unconditional assignment
    # here would silently discard them.

    # The minimum line length where line wrapping may occur: a text line
    # shorter than this is taken as the end of a phrase.
    if (WRAPPED_AT == "") WRAPPED_AT = 25;
    WRAPPED_AT += 0;    # force a numeric value for the length comparison

    # Set to a true value to get "!"-prefixed debugging traces.
    if (DEBUG == "") DEBUG = 0;

    # Internal state, always reset.
    # The last captured position (0 means "none pending").
    pos = 0;
    # The current accumulated text block
    buf = "";
    # The input document section (0 until the first timing line is seen)
    section = 0;
}
# Debug trace: when DEBUG is set, echo every input record, prefixed with
# its record number, before any other rule handles it.
DEBUG {
    print "![" NR "]" $0 "!";
}
# Capture start timestamp.
# Matches a cue timing line such as
#   00:01:02.345 --> 00:01:04.000 align:start position:0%
# The hours field may have any value (not just "00") and may be omitted
# ("01:02.345 --> ..."), so cues past the first hour are recognised too.
# Both "." and "," are accepted as the fraction separator.
/^([0-9]+:)?[0-9][0-9]:[0-9][0-9][.,][0-9]+[ \t]+-->/ {
    # Only remember the first timestamp of a block: pos is reset to 0
    # each time the accumulated text is printed.
    if (!pos) {
        pos = $1;
        # From the first timing line on, following lines are subtitle text.
        section = 1;
        if (DEBUG) print "!START";
    }
    # Timing lines themselves are never part of the text.
    next;
}
# Builds and outputs text blocks.
# Runs on every non-timing line once the first timing line has been seen.
section {
    # drop the trailing CR of CRLF-terminated files so it neither leaks
    # into the output nor counts towards the line length
    sub(/\r$/, "");
    # remove formatting marks
    gsub(/<[^>]*>/, "");
    # ignore empty lines
    # (bracket expression instead of \s, which is a gawk-only extension)
    if (/^[ \t]*$/) { next; }
    # ignore duplicated lines (the same text repeated in consecutive cues)
    if ($0 == line) {
        if (DEBUG) print "!DUP";
        next;
    }
    line = $0;
    # Accumulate chunks; each chunk is preceded by a single space, so a
    # printed block starts with one space.
    buf = buf " " line;
    # Print and reset blocks as needed: a short line, or a line ending with
    # a full stop, is taken as the end of a phrase.
    if ((length(line) < WRAPPED_AT) || (line ~ /\.[ \t]*$/)) {
        # pos is 0 when this block's position was already printed
        if (pos) print "\n@" pos;
        print buf;
        buf = "";
        pos = 0;
    }
}
END {
    # Print remaining text, i.e. chunks accumulated after the last flush.
    if (buf != "") {
        # Same guard as in the main rule: pos is 0 when the position of
        # this block was already printed, so do not emit a bogus "@0".
        if (pos) print "\n@" pos;
        print buf;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example output (with a sample from mongodb university):