Skip to content

Instantly share code, notes, and snippets.

@hdvianna
Created April 2, 2019 19:48
Show Gist options
  • Save hdvianna/5f5d35e2e9eb012345fec0d4b96f8bcb to your computer and use it in GitHub Desktop.
Save hdvianna/5f5d35e2e9eb012345fec0d4b96f8bcb to your computer and use it in GitHub Desktop.
PHP Dummy File Comparator
<?php
// Lower-case words that are ignored when looking for words shared between two lines.
const STOP_WORDS = [
    "the", "of", "in", "and", "a",
    "to", "on", "by", "when", "or",
    "with", "that", "this", "also", "from",
    "an", "it", "as", "these", ".",
];
/**
 * Finds the words that two lines of text have in common.
 *
 * Both lines are split on whitespace and on the characters . , ? ; ( ) - % { }.
 * Words are compared case-insensitively (via strtolower) and any word listed in
 * STOP_WORDS is ignored.
 *
 * Result keys keep the casing found in $lineA, so "Model" and "model" occurring
 * in the same line yield two separate entries.
 *
 * @param string $lineA First line of text.
 * @param string $lineB Second line of text.
 * @return array Map of word (as written in $lineA) to
 *               ['collisions' => int, 'wordsa' => int, 'wordsb' => int], where
 *               'collisions' is the number of (occurrence in A, occurrence in B)
 *               pairs that match and 'wordsa'/'wordsb' are the numbers of
 *               non-empty tokens in each line. Empty when nothing matches.
 */
function iteratesWords($lineA, $lineB) {
    $delimiters = "/[\s\.\,\?\;\(\)\-\%\{\}]/";
    // PREG_SPLIT_NO_EMPTY: adjacent delimiters (", ", ". ", a trailing newline)
    // would otherwise produce empty tokens that inflate the word counts.
    $wordsA = preg_split($delimiters, $lineA, -1, PREG_SPLIT_NO_EMPTY);
    $wordsB = preg_split($delimiters, $lineB, -1, PREG_SPLIT_NO_EMPTY);
    $countA = count($wordsA);
    $countB = count($wordsB);

    // Occurrences of every lower-cased word of line B, so each word of line A
    // needs a single lookup instead of a scan over all of line B.
    $frequencyB = array_count_values(array_map('strtolower', $wordsB));

    $words = [];
    foreach ($wordsA as $wordA) {
        $needle = strtolower($wordA);
        $matches = $frequencyB[$needle] ?? 0;
        if ($matches === 0 || in_array($needle, STOP_WORDS, true)) {
            continue;
        }
        if (!array_key_exists($wordA, $words)) {
            $words[$wordA] = [
                'collisions' => 0,
                'wordsa' => $countA,
                'wordsb' => $countB,
            ];
        }
        // A repeated word in A is credited once per matching occurrence in B.
        $words[$wordA]['collisions'] += $matches;
    }
    return $words;
}
/**
 * Compares every line of $linesA with every line of $linesB and reports the
 * pairs whose shared-word ratio reaches the threshold.
 *
 * For each pair, iteratesWords() supplies the shared words. The ratio is the
 * sum of their 'collisions' divided by the smaller of the two word counts
 * ('wordsa', 'wordsb'). Pairs without any shared word are skipped.
 *
 * @param iterable $linesA           Lines of the first text.
 * @param iterable $linesB           Lines of the second text.
 * @param float    $treshold         Minimum ratio (inclusive) required to report a pair.
 * @param callable $repeatedCallback Called as
 *                                   ($lineANumber, $lineBNumber, $repeated, $ratio)
 *                                   with 1-based line numbers.
 * @return void
 */
function iterateLines($linesA, $linesB, $treshold, $repeatedCallback) {
    $rowA = 0;
    foreach ($linesA as $textA) {
        ++$rowA;
        $rowB = 0;
        foreach ($linesB as $textB) {
            ++$rowB;
            $shared = iteratesWords($textA, $textB);
            if (count($shared) === 0) {
                continue;
            }

            $totalCollisions = 0;
            $wordCount = 0;
            foreach ($shared as $entry) {
                $totalCollisions += $entry['collisions'];
                // Overwritten on every pass: the value of the last entry is used.
                $wordCount = min($entry['wordsa'], $entry['wordsb']);
            }

            $score = $totalCollisions / $wordCount;
            if ($score >= $treshold) {
                $repeatedCallback($rowA, $rowB, $shared, $score);
            }
        }
    }
}
// Compare every line of ./fileA.tex with every line of ./fileB.tex and print the
// line pairs that reach a shared-word ratio of 0.7 and share at least 10 distinct words.
$linesA = file("./fileA.tex");//Model
$linesB = file("./fileB.tex");//Mapping
// file() returns false when a file cannot be read; stop with a clear error
// instead of iterating over false and silently printing nothing.
if ($linesA === false || $linesB === false) {
    throw new RuntimeException("Unable to read ./fileA.tex and/or ./fileB.tex");
}
iterateLines($linesA, $linesB, 0.7, function($lineANumber,$lineBNumber, $repeatedWords, $ratio) {
    $words = array_keys($repeatedWords);
    $wordCount = count($words);
    // Only report pairs with a substantial number of distinct shared words.
    if ($wordCount >= 10 ) {
        echo "File A line $lineANumber compared to B line $lineBNumber has ".$wordCount."(".(number_format($ratio*100,2))."%) words: ".implode(", ", $words).".".PHP_EOL.PHP_EOL;
    }
});
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment