Created
February 3, 2017 23:00
-
-
Save bobbydeveaux/e7793b2858a49d926734576aaec796bd to your computer and use it in GitHub Desktop.
Process Word Docs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Produce a rough plain-text rendering of a file by filtering its raw bytes.
 *
 * The content is split on carriage returns (0x0D). A segment that is empty or
 * contains a NUL byte (0x00) is emitted as a bare CRLF; every other segment is
 * emitted followed by a space and CRLF. Finally, every character that is not a
 * letter, digit, whitespace or one of , . - @ / _ ( ) is removed.
 *
 * @param string $wordDoc Path of the file to read.
 *
 * @return string The filtered text, using "\r\n" line endings.
 *
 * @throws \RuntimeException If the path is not a readable regular file.
 */
function parseWord($wordDoc)
{
    if (!is_file($wordDoc) || !is_readable($wordDoc)) {
        throw new \RuntimeException('Cannot read file: ' . $wordDoc);
    }

    // file_get_contents() opens, reads and closes the file in one call, so no
    // handle is leaked, and it returns '' for a zero-byte file instead of
    // failing the way fread() does when asked for a length of 0.
    $raw = file_get_contents($wordDoc);
    if (false === $raw) {
        throw new \RuntimeException('Cannot read file: ' . $wordDoc);
    }

    $outtext = '';
    foreach (explode(chr(0x0D), $raw) as $segment) {
        if ('' === $segment || false !== strpos($segment, chr(0x00))) {
            // Empty segments and segments containing NUL bytes become blank lines.
            $outtext .= "\r\n";
        } else {
            $outtext .= $segment . " \r\n";
        }
    }

    // Drop every character outside the whitelist.
    $outtext = preg_replace('/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/', '', $outtext);

    return $outtext;
}
// Driver: convert every regular file in files/ into a text file in processed/.
print 'Lets go...' . "\r\n";

// microtime(true) returns a float number of seconds. Without the argument it
// returns a "msec sec" string, and subtracting two of those gives a wrong
// (possibly negative) elapsed time.
$start = microtime(true);

if ($handle = opendir('files/')) {
    /* This is the correct way to loop over the directory. */
    while (false !== ($entry = readdir($handle))) {
        $sourcePath = 'files/' . $entry;

        // Skips '.', '..' and any sub-directory: only regular files can be parsed.
        if (!is_file($sourcePath)) {
            continue;
        }

        $content = parseWord($sourcePath);
        // The output name is the entry name with every '.doc' occurrence replaced by '.txt'.
        file_put_contents('processed/' . str_replace('.doc', '.txt', $entry), $content);
    }
    closedir($handle);
}

$end = microtime(true) - $start;
echo 'Took ' . $end . ' seconds.' . "\r\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment