Skip to content

Instantly share code, notes, and snippets.

@sscarduzio
Created July 9, 2012 14:37
Show Gist options
  • Save sscarduzio/3076914 to your computer and use it in GitHub Desktop.
Save sscarduzio/3076914 to your computer and use it in GitHub Desktop.
Count word occurrences in a text file, the scalable way (suitable for HUGE files)
<?php
/**
* Counts the words in a text file and outputs a chart of the most used ones.
*/
// Example driver, run from the CLI:
//   php WordChart.php freakingLongNovel.txt
// Refuse to continue without a file argument: otherwise an undefined $argv[1]
// would be handed to fopen() and fail with a far less helpful message.
if (!isset($argv[1]) || $argv[1] === '') {
    // php://stderr is used (rather than the STDERR constant) so the message
    // does not mix with the chart on standard output.
    file_put_contents('php://stderr', 'usage: php ' . basename(__FILE__) . " <text-file>\n");
    exit(1);
}
$filename = $argv[1];
$o = new WordChart($filename);
// Despite its name, toString() prints the chart itself; nothing is returned.
$o->toString();
// End example code
/**
 * Builds a word-frequency chart for a text file.
 *
 * The file is streamed in fixed-size chunks, so it is never held in memory
 * as a whole (only the word counts and the word currently being assembled
 * are kept). Tokens are split on whitespace, stripped of punctuation and
 * digits, lower-cased and counted.
 */
class WordChart {
    /** Number of bytes requested from the stream per read. */
    const CHUNK_SIZE = 8192;

    /** @var resource|null Read handle; reset to null once the scan is done. */
    private $fp = null;

    /** @var array Map of word => occurrences, sorted by count (descending) after the scan. */
    private $stats = array();

    /** @var int Number of words counted, repeats included. */
    private $total = 0;

    /**
     * Opens the file and counts its words right away.
     *
     * This is __construct() rather than a method named after the class:
     * PHP 8 no longer treats the latter as a constructor, which would leave
     * the object silently unscanned.
     *
     * @param string $filename Path of the text file to analyse.
     * @throws Exception If the file cannot be opened for reading.
     */
    public function __construct($filename) {
        $handle = fopen($filename, "r");
        if ($handle === false) {
            throw new Exception("cannot open specified file: $filename", 1);
        }
        $this->fp = $handle;
        $this->scan();
    }

    /**
     * Reads the whole stream, feeds every whitespace-delimited token to
     * countWord(), then closes the handle and sorts the chart.
     */
    private function scan() {
        // Tail of the previous chunk: it may be the first half of a word
        // that continues in the next chunk, so it is not counted yet.
        $carry = '';
        while (!feof($this->fp)) {
            $chunk = fread($this->fp, self::CHUNK_SIZE);
            // A failed or empty read that does not flag EOF would otherwise
            // keep this loop spinning forever.
            if ($chunk === false || $chunk === '') {
                break;
            }
            // Any whitespace run (space, tab, CR, LF, ...) ends a word.
            $tokens = preg_split('/\s+/', $carry . $chunk);
            if ($tokens === false) {
                fclose($this->fp);
                $this->fp = null;
                throw new Exception("cannot tokenize input", 2);
            }
            // The last piece is empty when the chunk ended on whitespace,
            // otherwise it is a possibly unfinished word.
            $carry = array_pop($tokens);
            foreach ($tokens as $token) {
                $this->countWord($token);
            }
        }
        // Whatever is left at end of input is a complete word.
        $this->countWord($carry);

        fclose($this->fp);
        $this->fp = null;
        // Highest counts first; keys (the words) are preserved.
        arsort($this->stats);
    }

    /**
     * Normalises one raw token and records it in the chart.
     *
     * Punctuation and digits are removed and the rest is lower-cased; a
     * token that ends up empty (e.g. "42" or "--") is ignored.
     *
     * @param string $token Whitespace-free piece of input text.
     */
    private function countWord($token) {
        $word = preg_replace('/[[:punct:]0-9]/', '', $token);
        // strtolower() is byte-wise, so only ASCII letters are reliably folded.
        $word = strtolower((string) $word);
        if ($word === '') {
            return;
        }
        $this->total++;
        if (isset($this->stats[$word])) {
            $this->stats[$word]++;
        } else {
            $this->stats[$word] = 1;
        }
    }

    /**
     * Prints the most frequent words, one per line, followed by the total
     * word count. Despite its name this method echoes; it returns nothing.
     *
     * @param int $limit Maximum number of chart rows to print (default 40).
     */
    public function toString($limit = 40) {
        $limit = (int) $limit;
        $rank = 0;
        foreach ($this->stats as $word => $score) {
            if ($rank >= $limit) {
                break;
            }
            $rank++;
            echo "$rank)\t$word\t\t\t($score)\n";
        }
        echo "total words: $this->total\n";
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment