Skip to content

Instantly share code, notes, and snippets.

@stephaneIBANEZ
Last active November 6, 2017 11:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stephaneIBANEZ/16fcb796ffdbf6e970b7215da9138e3a to your computer and use it in GitHub Desktop.
Save stephaneIBANEZ/16fcb796ffdbf6e970b7215da9138e3a to your computer and use it in GitHub Desktop.
Class Sentence: string manipulation
<?php
/*
* This file is part of the Inserm Radico package.
*
* @author Stéphane IBANEZ <stephane.ibanez@aezan.com>
*
* This class is used to manipulate strings; new methods will be added later.
*
* To use this class outside of a Symfony environment, delete the line "namespace InsermRadicoBundle\Services;".
*/
namespace InsermRadicoBundle\Services;
use \Normalizer;
/**
 * String helpers used for name matching: UTF-8 normalisation, word/position
 * maps, candidate scoring, word permutations and simple typo variants.
 *
 * The object keeps a small amount of state describing the LAST call to
 * normalizeUtf8String() (substitution count/list, word count) and the last
 * call to extSearch() (elapsed time).
 */
class Sentence
{
    /** Separator used to split a sentence into words and to join words back. */
    const SPACE = ' ';

    /** @var int Weighted number of substitutions made by the last normalizeUtf8String() call. */
    private $_substitutions = 0;

    /** @var string[] "original => replacement" entries of the last normalizeUtf8String() call. */
    private $_substitutionList = array();

    /** @var int Number of words in the result of the last normalizeUtf8String() call. */
    private $_nbWords = 0;

    /** @var float Seconds spent in the last extSearch() call. */
    private $_processTime = 0.0;

    /**
     * @return int Word count of the last normalised string (0 when it was empty).
     */
    public function getNbWords()
    {
        return $this->_nbWords;
    }

    /**
     * @return int Weighted substitution count of the last normalisation
     *             (1 per single-letter replacement, 2 per two-letter replacement).
     */
    public function getSubstitutions()
    {
        return $this->_substitutions;
    }

    /**
     * @return string[] Entries formatted as "original => replacement".
     */
    public function getSubstitutionList()
    {
        return $this->_substitutionList;
    }

    /**
     * @return float Seconds spent in the last extSearch() call.
     */
    public function getProcessTime()
    {
        return $this->_processTime;
    }

    /**
     * Splits a UTF-8 string into an array of characters (not bytes).
     *
     * @param string $str
     * @return string[]
     */
    protected function utf8_str_split($str)
    {
        $chars = array();
        $length = mb_strlen($str, 'UTF-8');
        for ($i = 0; $i < $length; $i++) {
            $chars[] = mb_substr($str, $i, 1, 'UTF-8');
        }

        return $chars;
    }

    /**
     * Splits a sentence on single spaces (consecutive spaces yield empty words).
     *
     * @param string $sentence
     * @return string[]
     */
    public function getWords($sentence)
    {
        return explode(self::SPACE, $sentence);
    }

    /**
     * Joins words with a single space.
     *
     * @param string[] $words
     * @return string
     */
    public function getSentence(array $words)
    {
        return implode(self::SPACE, $words);
    }

    /**
     * Transliterates accented/special characters, strips everything that is
     * not an ASCII letter or whitespace, and upper-cases the result.
     *
     * Side effects: updates the values returned by getSubstitutions(),
     * getSubstitutionList() and getNbWords().
     *
     * @param string $s                Text to normalise.
     * @param bool   $withSqlWildCards When true, the SQL LIKE wildcards "%" and "_" are kept.
     * @return string Upper-cased, trimmed result.
     */
    public function normalizeUtf8String($s = '', $withSqlWildCards = false)
    {
        $chars = $this->utf8_str_split(trim((string) $s));

        // Pass 1: one character replaced by one character (weight 1).
        $from = $this->utf8_str_split("-ŋžðÞáàâäãåçéèêëíìîïóòôöõøúùûüýÿŊŽÐþÁÀÂÄÃÅÇÉÈÊËÍÌÎÏÓÒÔÖÕØÚÙÛÜÝŸ");
        $to = $this->utf8_str_split(" nzdtaaaaaaceeeeiiiioooooouuuuyynzDTAAAAAACEEEEIIIIOOOOOOUUUUYY");
        $singleMap = array();
        foreach ($from as $i => $char) {
            if (isset($to[$i])) {
                $singleMap[$char] = $to[$i];
            }
        }

        // Pass 2: one character replaced by two characters (weight 2).
        // Ö, ö, Ü, ü and ÿ are already rewritten by pass 1, so their entries
        // here never apply; they are kept so the table matches its history.
        // NOTE(review): 'IJ' / 'ij' are presumably meant to be the single-character
        // ligatures; as two-character keys they can never equal one character.
        $doubleMap = array(
            'ß' => 'ss', 'Ñ' => 'NY', 'ñ' => 'ny', 'Ö' => 'OE', 'ö' => 'oe',
            'Ü' => 'UE', 'ü' => 'ue', 'Æ' => 'AE', 'æ' => 'ae', 'IJ' => 'IJ',
            'ij' => 'ij', 'ÿ' => 'yu', 'Œ' => 'OE',
        );

        $substitutions = 0;
        $subList = array();
        $passes = array(
            array('map' => $singleMap, 'weight' => 1),
            array('map' => $doubleMap, 'weight' => 2),
        );
        // Two full passes (rather than one combined pass) so the substitution
        // list keeps all pass-1 entries before the pass-2 entries.
        foreach ($passes as $pass) {
            foreach ($chars as $i => $char) {
                if (isset($pass['map'][$char])) {
                    $replacement = $pass['map'][$char];
                    $chars[$i] = $replacement;
                    $substitutions += $pass['weight'];
                    $subList[] = "$char => $replacement";
                }
            }
        }

        // The wildcards must be INSIDE the negated class to be preserved.
        $disallowed = $withSqlWildCards ? '/[^a-zA-Z\s%_]/' : '/[^a-zA-Z\s]/';
        $normalized = preg_replace($disallowed, '', implode('', $chars));
        $normalized = trim(mb_strtoupper($normalized, 'UTF-8'));

        $this->_substitutions = $substitutions;
        $this->_substitutionList = $subList;
        // Always reset the counter; count real words only, not the empty
        // tokens produced by consecutive whitespace.
        $this->_nbWords = count(preg_split('/\s+/', $normalized, -1, PREG_SPLIT_NO_EMPTY));

        return $normalized;
    }

    /**
     * Builds a word => index map. When a word occurs several times the last
     * index wins.
     *
     * @param array $words List of words.
     * @return array
     */
    public function position(array $words)
    {
        $positions = array();
        foreach (array_values($words) as $index => $word) {
            $positions[$word] = $index;
        }

        return $positions;
    }

    /**
     * usort() comparator ordering rows by their 'score' key, highest first.
     *
     * @param array $a
     * @param array $b
     * @return int -1, 0 or 1
     */
    public function sortByScore($a, $b)
    {
        if ($a['score'] < $b['score']) {
            return 1;
        }
        if ($a['score'] > $b['score']) {
            return -1;
        }

        return 0;
    }

    /**
     * Inverse of position(): turns a word => index map into index => word.
     *
     * @param array $positions
     * @return array
     */
    public function words(array $positions)
    {
        $byPosition = array();
        foreach ($positions as $word => $pos) {
            $byPosition[$pos] = $word;
        }

        return $byPosition;
    }

    /**
     * Returns the word stored at $position in a word => index map.
     *
     * @param array $words    Map as produced by position().
     * @param int   $position
     * @return string|int The word (array key), or '' when no word has that position.
     */
    public function getWordAt($words, $position)
    {
        foreach ($words as $word => $pos) {
            if ($pos === $position) {
                return $word;
            }
        }

        return '';
    }

    /**
     * Scores each candidate against $sentence, word by word.
     *
     * For every candidate word that also occurs in $sentence, one point is
     * given when the sentence has that same word at the candidate's position
     * ('match'), one when both have the same non-empty previous word ('prev')
     * and one when both have the same non-empty next word ('next').
     * Candidates scoring 0 are dropped.
     *
     * @param string   $sentence
     * @param string[] $combinations Candidate sentences.
     * @return array|false Rows (sentence, candidate, score, match, prev, next,
     *                     levenstein) sorted by score, best first; false when
     *                     no candidate scored.
     */
    public function score($sentence, $combinations)
    {
        $orgWords = explode(self::SPACE, $sentence);
        $orgCount = count($orgWords);
        $orgPositions = $this->position($orgWords);
        $result = array();

        foreach ($combinations as $string) {
            $destWords = explode(self::SPACE, $string);
            // Pad shorter candidates with placeholders up to the sentence length.
            for ($i = count($destWords); $i < $orgCount; $i++) {
                $destWords[$i] = "###$i";
            }
            $destPositions = $this->position($destWords);

            $match = 0;
            $next = 0;
            $prev = 0;
            foreach ($destPositions as $destWord => $position) {
                if (!array_key_exists($destWord, $orgPositions)) {
                    continue;
                }
                $prevDestWord = $this->getWordAt($destPositions, $position - 1);
                $nextDestWord = $this->getWordAt($destPositions, $position + 1);
                $inPosOrgWord = $this->getWordAt($orgPositions, $position);
                $prevOrgWord = $this->getWordAt($orgPositions, $position - 1);
                $nextOrgWord = $this->getWordAt($orgPositions, $position + 1);

                if ($inPosOrgWord === $destWord) {
                    $match++;
                }
                // '' means "no neighbour"; two missing neighbours do not count.
                if ($prevOrgWord === $prevDestWord && (string) $prevOrgWord !== '') {
                    $prev++;
                }
                if ($nextOrgWord === $nextDestWord && (string) $nextOrgWord !== '') {
                    $next++;
                }
            }

            $score = $match + $next + $prev;
            if ($score > 0) {
                $result[] = array(
                    'sentence' => $sentence,
                    'candidate' => $string,
                    'score' => $score,
                    'match' => $match,
                    'prev' => $prev,
                    'next' => $next,
                    // Key spelling kept as-is: callers read 'levenstein'.
                    'levenstein' => (int) levenshtein($sentence, $string, 5, 5, 5),
                );
            }
        }

        if (!$result) {
            return false;
        }
        usort($result, array($this, 'sortByScore'));

        return $result;
    }

    /**
     * Earlier scoring scheme: +1 for each distinct candidate word found in
     * $sentence, -1 for each one that is not (padding placeholders included).
     *
     * @deprecated Use score().
     *
     * @param string   $sentence
     * @param string[] $combinations Candidate sentences.
     * @return array Rows (sentence, score) sorted by score, best first.
     */
    public function scoreOld($sentence, $combinations)
    {
        $orgWords = explode(self::SPACE, $sentence);
        $orgCount = count($orgWords);
        $orgPositions = $this->position($orgWords);
        $result = array();

        foreach ($combinations as $string) {
            $destWords = explode(self::SPACE, $string);
            // Pad shorter candidates with placeholders up to the sentence length.
            for ($i = count($destWords); $i < $orgCount; $i++) {
                $destWords[$i] = "###$i";
            }

            $score = 0;
            foreach ($this->position($destWords) as $destWord => $unusedPosition) {
                if (!array_key_exists($destWord, $orgPositions)) {
                    $score--;
                    continue;
                }
                // Boundary markers are neutral.
                if ($destWord === "{begin}" || $destWord === "{end}") {
                    continue;
                }
                $score++;
            }
            $result[] = array('sentence' => $string, 'score' => $score);
        }

        usort($result, array($this, 'sortByScore'));

        return $result;
    }

    /**
     * Removes duplicate word lists (two lists are duplicates when they join
     * to the same sentence). First-occurrence order is kept.
     *
     * @param array $list List of word lists.
     * @return array Re-indexed list of unique word lists ([] for empty input).
     */
    public function deduplicate(array $list)
    {
        $unique = array();
        foreach ($list as $words) {
            $unique[$this->getSentence($words)] = $words;
        }

        return array_values($unique);
    }

    /**
     * Joins every word list of $input into a sentence.
     *
     * @param array $input List of word lists.
     * @return string[] ([] for empty input)
     */
    public function makeSentenceList(array $input)
    {
        return array_map(array($this, 'getSentence'), array_values($input));
    }

    /**
     * Returns every ordering of $input. The result has n! entries, so this
     * is only usable for short inputs.
     *
     * @param array $input
     * @return array List of permutations (each a list); [] for empty input.
     */
    public function permuteArray(array $input)
    {
        $input = array_values($input);
        $size = count($input);

        // A single value has exactly one permutation: itself.
        if ($size === 1) {
            return array($input);
        }

        // Put each value in front in turn and permute the remaining ones.
        $result = array();
        for ($i = 0; $i < $size; $i++) {
            $rest = $input;
            $head = array_splice($rest, $i, 1);
            foreach ($this->permuteArray($rest) as $permutation) {
                array_unshift($permutation, $head[0]);
                $result[] = $permutation;
            }
        }

        return $result;
    }

    /**
     * All word orderings of $sentence.
     *
     * @param string $sentence
     * @return array List of word lists.
     */
    public function isoPermute($sentence)
    {
        return $this->permuteArray($this->getWords($sentence));
    }

    /**
     * All word orderings of $sentence plus, for each ordering, all orderings
     * of that ordering without its last word. Duplicates are removed.
     *
     * @param string $sentence
     * @return array List of word lists.
     */
    public function dropPermute($sentence)
    {
        $list = $this->isoPermute($sentence);
        $initialCount = count($list);
        for ($l = 0; $l < $initialCount; $l++) {
            $shorter = array_slice($list[$l], 0, -1);
            $list = array_merge($list, $this->isoPermute($this->getSentence($shorter)));
        }

        return $this->deduplicate($list);
    }

    /**
     * Extends dropPermute() by repeatedly dropping the last word of every
     * entry (while more than one drop step remains) and adding the
     * dropPermute() of each shortened sentence. Duplicates are removed.
     *
     * @param string $sentence
     * @return array List of word lists.
     */
    public function fullPermute($sentence)
    {
        $list = $this->dropPermute($sentence);
        $initialCount = count($list);
        for ($l = 0; $l < $initialCount; $l++) {
            $words = $list[$l];
            for ($w = count($words) - 1; $w > 1; $w--) {
                $words = array_slice($words, 0, -1);
                $list = array_merge($list, $this->dropPermute($this->getSentence($words)));
            }
        }

        return $this->deduplicate($list);
    }

    /**
     * Typo variants with one character doubled: one variant per character
     * of $sentence, in order.
     *
     * @param string $sentence UTF-8 text.
     * @return string[] ([] for an empty string)
     */
    public function getDoubleChar($sentence)
    {
        $chars = $this->utf8_str_split($sentence);
        $altered = array();
        foreach ($chars as $index => $char) {
            $altered[] = implode('', array_slice($chars, 0, $index))
                . $char
                . implode('', array_slice($chars, $index));
        }

        return $altered;
    }

    /**
     * Prints (HTML) and returns, for each candidate, how many characters it
     * shares with $sentence at the same position.
     *
     * @param string   $sentence
     * @param string[] $combinations
     * @return array candidate => number of matching positions
     */
    public function parseDoubleChar($sentence, $combinations)
    {
        return $this->reportPositionalMatches($sentence, $combinations);
    }

    /**
     * Typo variants with one character removed: one variant per character
     * of $sentence, in order.
     *
     * @param string $sentence UTF-8 text.
     * @return string[] ([] for an empty string)
     */
    public function getDroppedChar($sentence)
    {
        $chars = $this->utf8_str_split($sentence);
        $altered = array();
        foreach ($chars as $index => $unusedChar) {
            $altered[] = implode('', array_slice($chars, 0, $index))
                . implode('', array_slice($chars, $index + 1));
        }

        return $altered;
    }

    /**
     * Prints (HTML) and returns, for each candidate, how many characters it
     * shares with $sentence at the same position.
     *
     * @param string   $sentence
     * @param string[] $combinations
     * @return array candidate => number of matching positions
     */
    public function parseDropChar($sentence, $combinations)
    {
        return $this->reportPositionalMatches($sentence, $combinations);
    }

    /**
     * Not implemented yet.
     *
     * @return null
     */
    public function lettersMatch($sentence, $combinations, $ratio = .8)
    {
        // @todo
    }

    /**
     * Not implemented yet.
     *
     * @return null
     */
    public function subSentenceMatch($sentence, $combinations, $ratio = 1)
    {
        // @todo
    }

    /**
     * Not implemented yet.
     *
     * @return null
     */
    public function getMatch5on5($string, $candidate)
    {
        // @todo
    }

    /**
     * Searches $persons for names close to $string.
     *
     * A person is kept when either
     *  - "5/5": every word of $string occurs in the person's name, or every
     *    word of the name occurs in $string (substring test); or
     *  - "4/5": the share of characters identical at the same position is at
     *    least $percent of BOTH the name length and the search length.
     *
     * Side effect: stores the elapsed time, see getProcessTime().
     *
     * @param string $string  Search text; expected to be normalised the same
     *                        way as 'fullNameUniformized'.
     * @param array  $persons Rows, each with a 'fullNameUniformized' key.
     * @param int    $percent Threshold of the positional ("4/5") test.
     * @return array Rows with keys fMatch4_5, bMatch4_5, condition4_5,
     *               fMatch5_5, bMatch5_5, condition5_5, person, percent
     *               (100 for a 5/5 hit, otherwise the positional share of the
     *               search length), ordered by percent, best first.
     */
    public function extSearch($string, $persons, $percent = 80)
    {
        $begin = microtime(true);
        $results = array();
        $searchWords = $this->nonEmptyWords($string);
        $searchChars = $this->utf8_str_split($string);

        foreach ($persons as $person) {
            $fullName = $person['fullNameUniformized'];
            $nameWords = $this->nonEmptyWords($fullName);
            $nameChars = $this->utf8_str_split($fullName);

            // 5/5: whole-word inclusion in either direction.
            $fwdFound = $this->countContainedWords($searchWords, $fullName);
            $bckFound = $this->countContainedWords($nameWords, $string);
            $fwdAll = $fwdFound > 0 && $fwdFound === count($searchWords);
            $bckAll = $bckFound > 0 && $bckFound === count($nameWords);
            $condition5_5 = $fwdAll || $bckAll;

            // 4/5: characters identical at the same position.
            $same = 0;
            $common = min(count($searchChars), count($nameChars));
            for ($i = 0; $i < $common; $i++) {
                if ($searchChars[$i] === $nameChars[$i]) {
                    $same++;
                }
            }
            $countFwd = count($nameChars);
            $countBack = count($searchChars);
            // Guard the divisions: either string may be empty.
            $conditionFwd = $countFwd > 0 ? ($same / $countFwd) * 100 : 0;
            $conditionBack = $countBack > 0 ? ($same / $countBack) * 100 : 0;
            $condition4_5 = $conditionFwd >= $percent && $conditionBack >= $percent;

            if (!$condition4_5 && !$condition5_5) {
                continue;
            }

            $results[] = array(
                'fMatch4_5' => $this->colouredRatio($same, $countFwd, $conditionFwd >= $percent),
                'bMatch4_5' => $this->colouredRatio($same, $countBack, $conditionBack >= $percent),
                'condition4_5' => $condition4_5,
                'fMatch5_5' => $this->colouredRatio($fwdFound, count($searchWords), $fwdAll),
                'bMatch5_5' => $this->colouredRatio($bckFound, count($nameWords), $bckAll),
                'condition5_5' => $condition5_5,
                'person' => $person,
                'percent' => $condition5_5 ? 100 : $conditionBack,
            );
        }

        // NOTE(review): best-first order is assumed, mirroring sortByScore().
        usort($results, array($this, 'sortByPercent'));
        $this->_processTime = microtime(true) - $begin;

        return $results;
    }

    /** Splits on single spaces and drops empty tokens. */
    private function nonEmptyWords($sentence)
    {
        $words = array();
        foreach (explode(self::SPACE, $sentence) as $word) {
            if ($word !== '') {
                $words[] = $word;
            }
        }

        return $words;
    }

    /** Counts the words of $words that occur as a substring of $haystack. */
    private function countContainedWords(array $words, $haystack)
    {
        $found = 0;
        foreach ($words as $word) {
            if (strpos($haystack, $word) !== false) {
                $found++;
            }
        }

        return $found;
    }

    /** Formats "num/den" in a green (passed) or red (failed) font tag. */
    private function colouredRatio($numerator, $denominator, $passed)
    {
        $colour = $passed ? 'green' : 'red';

        return "<font color='" . $colour . "'>" . $numerator . "/" . $denominator . "</font>";
    }

    /** usort() comparator ordering rows by their 'percent' key, highest first. */
    private function sortByPercent($a, $b)
    {
        if ($a['percent'] < $b['percent']) {
            return 1;
        }
        if ($a['percent'] > $b['percent']) {
            return -1;
        }

        return 0;
    }

    /**
     * Shared body of parseDoubleChar() and parseDropChar(): echoes an HTML
     * trace and returns candidate => number of positions holding the same
     * character as $sentence.
     */
    private function reportPositionalMatches($sentence, $combinations)
    {
        $sentenceChars = $this->utf8_str_split($sentence);
        $results = array();

        echo "<pre>";
        print_r($combinations);
        echo "</pre>";
        foreach ($combinations as $string) {
            echo "<hr>";
            $candidateChars = $this->utf8_str_split($string);
            // Compare only positions that exist in both strings.
            $common = min(count($sentenceChars), count($candidateChars));
            $match = 0;
            for ($i = 0; $i < $common; $i++) {
                if ($candidateChars[$i] === $sentenceChars[$i]) {
                    echo "<br>$i match " . $candidateChars[$i] . "==" . $sentenceChars[$i];
                    $match++;
                }
            }
            $results[$string] = $match;
        }
        echo "<pre>";
        print_r($results);
        echo "</pre>";

        return $results;
    }
}
/********************* USAGE ************************
$string = 'j\'appelais la classe SENTENCE de Stéphane Ibáñez.12345 #?';
$sentence = new Sentence();
$sentenceUniformized = $sentence->normalizeUtf8String($string); // get uniformized sentence
$sentenceSubstitutions = $sentence->getSubstitutions(); // get number of substitutions
$sentenceSubstitutionList = $sentence->getSubstitutionList(); // get substitutions list
$sentenceNbWords = $sentence->getNbWords(); // get number of words
*/
@stephaneIBANEZ
Copy link
Author

Last release of Sentence class with all basic functions

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment