Split blocks of video transcript into paragraphs
<?php | |
/* quick and dirty paragraph splitter
 * by Erorus
 *
 * this script is fully in the public domain, do what you want with it
 *
 * intended for use with unformatted video transcripts
 * such as the ones produced by Ars Technica, e.g.
 * https://cdn.arstechnica.net/wp-content/uploads/2018/04/3545034e-f60c-4657-b223-6cde51b5576fcc.txt
 */
// Upper bound on paragraph size, in bytes (BuildPara measures with strlen)
define('PARA_LENGTH', 600);

// Each non-empty line of the input file is treated as one block of transcript text
$inputPath = __DIR__ . '/textblocks.txt';
$lines = file($inputPath, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($lines === false) {
    // file() returns false when the file is missing or unreadable; stop instead of
    // iterating over a non-array and printing an empty result with a success status
    file_put_contents('php://stderr', "Unable to read input file: $inputPath\n");
    exit(1);
}

$paragraphs = [];
foreach ($lines as $line) {
    // append in place rather than array_merge(), which re-copies the whole list every iteration
    foreach (BuildPara($line) as $paragraph) {
        $paragraphs[] = $paragraph;
    }
}

// paragraphs are separated by one blank line
echo implode("\n\n", $paragraphs), "\n";
/**
 * Split one block of unformatted text into paragraphs.
 *
 * The block is cut into sentences wherever '.', '?' or '!' (plus any directly
 * following non-word, non-space characters such as a closing quote) is followed
 * by a space or the end of the block. Sentences are then grouped:
 *  - a sentence whose punctuation starts with '?' always gets its own paragraph;
 *  - a sentence that would push the current paragraph past $maxLength bytes
 *    starts a new paragraph;
 *  - a sentence with a comma in its first 10 bytes is assumed to open with an
 *    introductory word, and also starts a new paragraph.
 *
 * @param string   $line      One block of text.
 * @param int|null $maxLength Paragraph length limit in bytes; null uses PARA_LENGTH.
 * @return string[] Paragraphs in original order; empty if the block has no text.
 */
function BuildPara($line, $maxLength = null) {
    if ($maxLength === null) {
        $maxLength = PARA_LENGTH;
    }

    // PREG_SPLIT_NO_EMPTY is deliberately NOT used: keeping empty pieces guarantees
    // the result strictly alternates words, punctuation, words, ... so a block that
    // starts with punctuation cannot shift the pairing.
    $split = preg_split('/([\.\?!][^\w\s]*)(?: |$)/', $line, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($split === false) {
        // PCRE error (e.g. backtrack limit): hand the block back untouched
        return [$line];
    }

    $ret = [];
    $para = '';
    for ($i = 0, $pieces = count($split); $i < $pieces; $i += 2) {
        $words = trim($split[$i]);
        // the last piece has no punctuation after it
        $punct = isset($split[$i + 1]) ? $split[$i + 1] : '';
        $text = $words . $punct;
        if ($text === '') {
            // empty remainder after the final delimiter, or a whitespace-only block
            continue;
        }

        if (substr($punct, 0, 1) === '?') {
            // sentence is a question, force into own paragraph
            if ($para !== '') {
                // close prev paragraph
                $ret[] = $para;
                $para = '';
            }
            $ret[] = $text; // new paragraph is just the question
            continue;
        }

        if ($para !== '') {
            $commaPos = strpos($text, ',');
            if (strlen($para) + strlen($text) > $maxLength // hit length limit on paragraph
                || ($commaPos !== false && $commaPos < 10)) { // sentence starts with introductory word, probable new thought/paragraph
                $ret[] = $para;
                $para = '';
            }
        }
        $para .= ($para !== '' ? ' ' : '') . $text;
    }

    // only emit the trailing paragraph when there is one (the block may end on a question)
    if ($para !== '') {
        $ret[] = $para;
    }
    return $ret;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment