Skip to content

Instantly share code, notes, and snippets.

@rodneyrehm
Created April 18, 2012 10:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rodneyrehm/2412756 to your computer and use it in GitHub Desktop.
Save rodneyrehm/2412756 to your computer and use it in GitHub Desktop.
PHPGangsta: Strings in String suchen
<?php
// http://www.phpgangsta.de/algorithmus-gesucht-strings-in-string-suchen
function _outputStringPositionsFunction($longText, $searchTexts) {
// of 10000 tokens only 9143 are unique
$tokens = array_unique($searchTexts);
$positions = array();
// make similar tokens appear in order
sort($tokens);
// build groups of similar tokens
// 9143 distinct tokens form 916 groups
$beginnings = array();
$groups = array();
$_group = null;
$_previous = null;
$_previous_length = null;
foreach ($tokens as $token) {
$_length = strlen($token);
if (!$_previous || $_previous_length > $_length || strncmp($token, $_previous, $_previous_length)) {
// start group
$groups[$token] = array($token);
$_group = $token;
// find beginning of group
$_pattern = '#' . preg_quote($token, '#') . '#';
preg_match($_pattern, $longText, $_match, PREG_OFFSET_CAPTURE);
$beginnings[$token] = isset($_match[0][1]) ? $_match[0][1] : false;
} else {
// add to group
$groups[$_group][] = $token;
}
$_previous = $token;
$_previous_length = $_length;
}
while ($groups) {
$revisit = array();
foreach ($groups as $_group => &$group) {
if ($beginnings[$_group] === false) {
break;
}
// start with the longest item in the group
rsort($group);
$_revisit = array();
foreach ($group as $token) {
$_length = strlen($token);
if (substr($longText, $beginnings[$_group], $_length) === $token) {
foreach ($group as $t) {
$positions[$t] = $beginnings[$_group];
}
break;
} else {
$_revisit[] = $token;
}
}
if ($_revisit) {
$_length = count($_revisit);
$token = $_revisit[$_length -1];
$revisit[$token] = $_revisit;
// find beginning of group
$_pattern = '#' . preg_quote($token, '#') . '#';
preg_match($_pattern, $longText, $_match, PREG_OFFSET_CAPTURE);
$beginnings[$token] = isset($_match[0][1]) ? $_match[0][1] : false;
}
}
$groups = $revisit;
}
// output positions in order of $searchText
foreach ($searchTexts as $token) {
if (isset($positions[$token])) {
echo $positions[$token];
}
echo "\n";
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment