Created
July 9, 2012 14:37
-
-
Save sscarduzio/3076914 to your computer and use it in GitHub Desktop.
Count word occurrences in a text file, the scalable way (suitable for HUGE files)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Counts words in a text file and outputs a chart of the most used.
 */
// Example usage, run from the CLI:
//   php WordChart.php freakingLongNovel.txt
//
// The file name is taken from the first command-line argument. A missing
// argument or an unreadable file is reported on STDERR with a non-zero exit
// status instead of surfacing as an undefined-index notice / uncaught exception.
if (!isset($argv[1])) {
    fwrite(STDERR, "usage: php WordChart.php <file>\n");
    exit(1);
}
$filename = $argv[1];
try {
    $o = new WordChart($filename);
    $o->toString();
} catch (Exception $e) {
    fwrite(STDERR, $e->getMessage() . "\n");
    exit(1);
}
// End example code
/**
 * Streams a text file, counts how often each word occurs and prints a chart
 * of the most frequent words.
 *
 * A "word" is a run of bytes delimited by ASCII whitespace, with ASCII
 * punctuation and digits removed and ASCII letters lower-cased. Tokens that
 * are empty after cleaning (e.g. "42" or "--") are not counted.
 * Note: case folding uses strtolower(), so non-ASCII letters are kept as-is.
 */
class WordChart
{
    /** Number of bytes requested from the stream per read. */
    const READ_CHUNK_BYTES = 8192;

    /** Open stream handle while scanning; null once the scan has finished. */
    private $fp = null;

    /** Map of word => number of occurrences, sorted by count (descending). */
    private $stats = array();

    /** Total number of counted words (sum of all occurrences). */
    private $total = 0;

    /**
     * Opens the file and immediately scans it completely.
     *
     * Declared as __construct: a method named after the class (the PHP 4
     * constructor style) is no longer treated as a constructor since PHP 8.0,
     * which would leave the chart empty.
     *
     * @param string $filename Path (or stream URL) of the text to analyse.
     *
     * @throws Exception With code 1 when the file cannot be opened, or with
     *                   code 2 when reading from it fails.
     */
    public function __construct($filename)
    {
        $handle = fopen($filename, "r");
        // fopen() signals failure with false, never with null.
        if ($handle === false) {
            throw new Exception("cannot open specified file: $filename", 1);
        }
        $this->fp = $handle;
        $this->scan();
    }

    /**
     * Reads the stream chunk by chunk, counts every word and finally sorts
     * the statistics by descending count. The handle is always closed, even
     * when reading fails.
     *
     * @throws Exception With code 2 when fread() reports a failure.
     */
    private function scan()
    {
        // Trailing partial word of the previous chunk; it may continue in the
        // next chunk, so it is only counted once a separator (or EOF) follows.
        $carry = '';
        try {
            while (!feof($this->fp)) {
                $chunk = fread($this->fp, self::READ_CHUNK_BYTES);
                if ($chunk === false) {
                    throw new Exception("cannot read from input stream", 2);
                }
                if ($chunk === '') {
                    continue;
                }
                // Any ASCII whitespace separates words (space, tab, CR, LF,
                // form feed, vertical tab) so that "foo\tbar" or Windows line
                // endings do not glue two words together.
                $tokens = preg_split('/[ \t\r\n\f\v]+/', $carry . $chunk);
                // The last piece is either '' (chunk ended on a separator) or
                // an unfinished word.
                $carry = array_pop($tokens);
                foreach ($tokens as $token) {
                    $this->addWord($token);
                }
            }
            $this->addWord($carry);
        } finally {
            fclose($this->fp);
            $this->fp = null;
        }
        arsort($this->stats);
    }

    /**
     * Cleans a raw token and records it in the statistics.
     *
     * @param string $token Whitespace-free token as read from the stream.
     */
    private function addWord($token)
    {
        // [:punct:] already covers '-', '_' and '/'; digits are dropped too.
        $word = strtolower((string) preg_replace('/[[:punct:]0-9]/', '', $token));
        if ($word === '') {
            return;
        }
        $this->total++;
        if (isset($this->stats[$word])) {
            $this->stats[$word]++;
        } else {
            $this->stats[$word] = 1;
        }
    }

    /**
     * Prints the chart to standard output: one "rank)\tword\t\t\t(count)" line
     * per word, most frequent first, followed by the total word count.
     *
     * @param int $limit Maximum number of chart rows to print (default 40).
     */
    public function toString($limit = 40)
    {
        $rank = 1;
        foreach ($this->stats as $word => $score) {
            if ($rank > $limit) {
                break;
            }
            echo "$rank)\t$word\t\t\t($score)\n";
            $rank++;
        }
        echo "total words: {$this->total}\n";
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment