Skip to content

Instantly share code, notes, and snippets.

@melchisedech333
Created April 10, 2022 23:49
Show Gist options
  • Save melchisedech333/c75a38d1c75a5391ce496999ec29795a to your computer and use it in GitHub Desktop.
Save melchisedech333/c75a38d1c75a5391ce496999ec29795a to your computer and use it in GitHub Desktop.
Ranking words of file texts
<?php
// Read file contents.
// Concatenates every regular file found directly inside $path into $content.
$path = 'texts-youtube/';
$content = '';
if ($handle = opendir($path)) {
    while (false !== ($entry = readdir($handle))) {
        $file = $path . $entry;
        // Skips ".", ".." and sub-directories: file_get_contents() cannot read a directory.
        if (!is_file($file)) {
            continue;
        }
        $data = file_get_contents($file);
        if ($data === false) {
            // Unreadable file: leave it out rather than appending a failed read.
            continue;
        }
        // The trailing space keeps the last word of this file from being glued
        // to the first word of the next file when the text is split later.
        $content .= $data . ' ';
    }
    closedir($handle);
}
// Prepare words.
/**
 * Tells whether the given value is exactly one lowercase ASCII letter (a-z).
 *
 * The check is strict: only a one-byte string in the range 'a'..'z' qualifies.
 * Non-string values (for example `true`, which a loose `==` comparison against
 * a letter would accept) are rejected.
 *
 * @param mixed $ch Value to test; normally a single character of a word.
 * @return bool True when $ch is a single character between 'a' and 'z'.
 */
function check_letter ($ch) {
    return is_string($ch)
        && strlen($ch) === 1
        && $ch >= 'a'
        && $ch <= 'z';
}
// Split the raw text into tokens on ANY run of whitespace (spaces, tabs, line
// breaks). Splitting on the space character alone would leave words separated
// only by a newline in one token, and they would fuse once non-letters are
// stripped below.
$words = preg_split('/\s+/', $content, -1, PREG_SPLIT_NO_EMPTY);
if ($words === false) {
    // preg_split() reports an internal regex failure with false; treat as no input.
    $words = array();
}
$newWords = array();
foreach ($words as $rawWord) {
    $rawWord = strtolower($rawWord);
    // Keep only the characters accepted by check_letter(); digits, punctuation
    // and any other bytes are dropped.
    $word = '';
    $length = strlen($rawWord);
    for ($b = 0; $b < $length; $b++) {
        if (check_letter($rawWord[$b]) === true) {
            $word .= $rawWord[$b];
        }
    }
    // Words shorter than two letters are discarded.
    if (strlen($word) >= 2) {
        $newWords[] = $word;
    }
}
// Remove duplicate.
// Collapses $newWords into $wordlist: one array('word' => ..., 'total' => ...)
// entry per distinct word, in order of first appearance, where 'total' is the
// number of occurrences.
$words = $newWords;
$wordlist = array();
// Maps each word to its position in $wordlist so that finding an existing
// entry is a single hash lookup instead of a rescan of the whole list.
$positions = array();
$total = count($words);
echo "Remove duplicate...\n";
echo "Current total: ". $total ."\n";
for ($a = 0; $a < $total; $a++) {
    $word = $words[$a];
    if (isset($positions[$word])) {
        $wordlist[ $positions[$word] ]['total']++;
    } else {
        $positions[$word] = count($wordlist);
        $wordlist[] = array(
            'word' => $word,
            'total' => 1
        );
    }
    // Show how many words have been processed so far, so the final update
    // reads "N / N" rather than stopping one short.
    echo "\rProcess: ". ($a + 1) ." / ". $total;
}
echo "\n";
// Order list.
// Sort the entries by descending occurrence count. array_multisort() sorts
// $words alongside $total and uses $words itself to order entries whose
// counts are equal.
$words = $wordlist;
$total = array_column($words, 'total');
array_multisort($total, SORT_DESC, $words);
// Build the whole report in memory, one "<count> - <word>" line per entry.
$lines = '';
foreach ($words as $entry) {
    $lines .= $entry['total'] .' - '. $entry['word'] ."\n";
}
// Write once and overwrite: appending line by line reopened the file for every
// word and stacked a new ranking on top of the previous run's output.
if (file_put_contents('words.txt', $lines) === false) {
    echo "Could not write words.txt\n";
    exit(1);
}
echo "Finished!\n\n\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment