Created
April 18, 2012 10:46
-
-
Save rodneyrehm/2412756 to your computer and use it in GitHub Desktop.
PHPGangsta: Strings in String suchen
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// http://www.phpgangsta.de/algorithmus-gesucht-strings-in-string-suchen | |
function _outputStringPositionsFunction($longText, $searchTexts) { | |
// of 10000 tokens only 9143 are unique | |
$tokens = array_unique($searchTexts); | |
$positions = array(); | |
// make similar tokens appear in order | |
sort($tokens); | |
// build groups of similar tokens | |
// 9143 distinct tokens form 916 groups | |
$beginnings = array(); | |
$groups = array(); | |
$_group = null; | |
$_previous = null; | |
$_previous_length = null; | |
foreach ($tokens as $token) { | |
$_length = strlen($token); | |
if (!$_previous || $_previous_length > $_length || strncmp($token, $_previous, $_previous_length)) { | |
// start group | |
$groups[$token] = array($token); | |
$_group = $token; | |
// find beginning of group | |
$_pattern = '#' . preg_quote($token, '#') . '#'; | |
preg_match($_pattern, $longText, $_match, PREG_OFFSET_CAPTURE); | |
$beginnings[$token] = isset($_match[0][1]) ? $_match[0][1] : false; | |
} else { | |
// add to group | |
$groups[$_group][] = $token; | |
} | |
$_previous = $token; | |
$_previous_length = $_length; | |
} | |
while ($groups) { | |
$revisit = array(); | |
foreach ($groups as $_group => &$group) { | |
if ($beginnings[$_group] === false) { | |
break; | |
} | |
// start with the longest item in the group | |
rsort($group); | |
$_revisit = array(); | |
foreach ($group as $token) { | |
$_length = strlen($token); | |
if (substr($longText, $beginnings[$_group], $_length) === $token) { | |
foreach ($group as $t) { | |
$positions[$t] = $beginnings[$_group]; | |
} | |
break; | |
} else { | |
$_revisit[] = $token; | |
} | |
} | |
if ($_revisit) { | |
$_length = count($_revisit); | |
$token = $_revisit[$_length -1]; | |
$revisit[$token] = $_revisit; | |
// find beginning of group | |
$_pattern = '#' . preg_quote($token, '#') . '#'; | |
preg_match($_pattern, $longText, $_match, PREG_OFFSET_CAPTURE); | |
$beginnings[$token] = isset($_match[0][1]) ? $_match[0][1] : false; | |
} | |
} | |
$groups = $revisit; | |
} | |
// output positions in order of $searchText | |
foreach ($searchTexts as $token) { | |
if (isset($positions[$token])) { | |
echo $positions[$token]; | |
} | |
echo "\n"; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment