Skip to content

Instantly share code, notes, and snippets.

@bobbydeveaux
Created February 3, 2017 23:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bobbydeveaux/e7793b2858a49d926734576aaec796bd to your computer and use it in GitHub Desktop.
Save bobbydeveaux/e7793b2858a49d926734576aaec796bd to your computer and use it in GitHub Desktop.
Process Word Docs
<?php
/**
 * Pull a rough plain-text rendering out of a file (named for Word documents).
 *
 * The file is read as raw bytes and split on carriage returns (0x0D). A
 * segment that is empty or contains a NUL byte (0x00) is emitted as a blank
 * line; every other segment is kept and followed by a space and CRLF. Finally
 * every character outside a small whitelist (letters, digits, whitespace and
 * a handful of punctuation marks) is removed.
 *
 * @param string $wordDoc Path of the file to read.
 *
 * @return string The extracted text, each line terminated by "\r\n". An empty
 *                file yields a single blank line ("\r\n").
 *
 * @throws RuntimeException If the file cannot be read.
 */
function parseWord($wordDoc)
{
    // file_get_contents opens, reads and closes in one call, so no handle is
    // leaked, and it copes with zero-length files (fread() rejects a length
    // of 0 on PHP 8).
    $raw = file_get_contents($wordDoc);
    if (false === $raw) {
        throw new RuntimeException('Unable to read Word document: ' . $wordDoc);
    }

    $outtext = '';
    foreach (explode(chr(0x0D), $raw) as $segment) {
        if ((0 === strlen($segment)) || (false !== strpos($segment, chr(0x00)))) {
            // Drop the segment's content but keep the blank line.
            $outtext .= "\r\n";
        } else {
            $outtext .= $segment . " \r\n";
        }
    }

    // Strip everything that is not on the whitelist of printable characters.
    return preg_replace('/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/', '', $outtext);
}
// Batch driver: convert every regular file in files/ to text and write the
// result into processed/, then report how long the run took.
print 'Lets go...' . "\r\n";
// microtime(true) returns the time as a float number of seconds; the default
// string form ("msec sec") cannot be subtracted meaningfully.
$start = microtime(true);
if ($handle = opendir('files/')) {
    while (false !== ($entry = readdir($handle))) {
        $source = 'files/' . $entry;
        // Skips '.', '..' and any subdirectory; only regular files are parsed.
        if (!is_file($source)) {
            continue;
        }
        $content = parseWord($source);
        // Swap only a trailing ".doc" extension, so names that merely contain
        // ".doc" somewhere (or end in ".docx") are not mangled.
        $target = 'processed/' . preg_replace('/\.doc\z/', '.txt', $entry);
        if (false === file_put_contents($target, $content)) {
            echo 'Could not write ' . $target . "\r\n";
        }
    }
    closedir($handle);
}
$end = microtime(true) - $start;
echo 'Took ' . $end . ' seconds.' . "\r\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment