Skip to content

Instantly share code, notes, and snippets.

@juriad juriad/excerpt.php

Created Mar 30, 2016
Embed
What would you like to do?
Zvýraznění hledaného výrazu v textu s ignorováním velikosti písma a diakritiky; v PHP
<?php
# Builds a short excerpt of a text around the occurrences of a search term.
#
# Matching ignores letter case and diacritic marks: both the text and the
# search term are folded to a diacritics-free form (one character in, one
# character out), so character positions found in the folded text are valid
# in the original text as well. The excerpt is cut at word boundaries and
# every occurrence inside it may be wrapped by a caller-supplied callback.
#
# All positions and lengths are in characters (mb_* functions), not bytes.
class Excerpt {
    # transformation rules for characters common in Central Europe;
    # at the end, blank characters (tab, LF, VT, FF, CR) are transformed
    # to a single space each, so the character count never changes
    private static $transformation = [
        'ä' => 'a', 'Ä' => 'A',
        'á' => 'a', 'Á' => 'A',
        'à' => 'a', 'À' => 'A',
        'ã' => 'a', 'Ã' => 'A',
        'â' => 'a', 'Â' => 'A',
        'č' => 'c', 'Č' => 'C',
        'ć' => 'c', 'Ć' => 'C',
        'ď' => 'd', 'Ď' => 'D',
        'ě' => 'e', 'Ě' => 'E',
        'é' => 'e', 'É' => 'E',
        'ë' => 'e', 'Ë' => 'E',
        'è' => 'e', 'È' => 'E',
        'ê' => 'e', 'Ê' => 'E',
        'í' => 'i', 'Í' => 'I',
        'ï' => 'i', 'Ï' => 'I',
        'ì' => 'i', 'Ì' => 'I',
        'î' => 'i', 'Î' => 'I',
        'ľ' => 'l', 'Ľ' => 'L',
        'ĺ' => 'l', 'Ĺ' => 'L',
        'ń' => 'n', 'Ń' => 'N',
        'ň' => 'n', 'Ň' => 'N',
        'ñ' => 'n', 'Ñ' => 'N',
        'ó' => 'o', 'Ó' => 'O',
        'ö' => 'o', 'Ö' => 'O',
        'ô' => 'o', 'Ô' => 'O',
        'ò' => 'o', 'Ò' => 'O',
        'õ' => 'o', 'Õ' => 'O',
        'ő' => 'o', 'Ő' => 'O',
        'ř' => 'r', 'Ř' => 'R',
        'ŕ' => 'r', 'Ŕ' => 'R',
        'š' => 's', 'Š' => 'S',
        'ś' => 's', 'Ś' => 'S',
        'ť' => 't', 'Ť' => 'T',
        'ú' => 'u', 'Ú' => 'U',
        'ů' => 'u', 'Ů' => 'U',
        'ü' => 'u', 'Ü' => 'U',
        'ù' => 'u', 'Ù' => 'U',
        'ũ' => 'u', 'Ũ' => 'U',
        'û' => 'u', 'Û' => 'U',
        'ý' => 'y', 'Ý' => 'Y',
        'ž' => 'z', 'Ž' => 'Z',
        'ź' => 'z', 'Ź' => 'Z',
        "\x09" => ' ', # horizontal tab
        "\x0a" => ' ', # line feed
        "\x0b" => ' ', # vertical tab
        "\x0c" => ' ', # form feed
        "\x0d" => ' '  # carriage return
    ];

    # original text
    private $text;
    # text without diacritic marks
    private $textAscii;
    # length of the text in characters
    private $textLength;
    # array of [start, end) positions of runs of spaces; the beginning and
    # the end of the text are always present as (possibly empty) runs
    private $textSpaces;
    # original search
    private $search;
    # search without diacritic marks
    private $searchAscii;
    # length of the search in characters
    private $searchLength;
    # array of positions of searchAscii inside textAscii (ignoring case)
    private $searchPositions;

    # $text   - the text the excerpt is taken from
    # $search - the term to look for (case and diacritics are ignored)
    public function __construct($text, $search) {
        $this->text = $text;
        $this->textAscii = self::toAscii($text);
        $this->textLength = mb_strlen($text);
        $this->findSpaces();

        $this->search = $search;
        $this->searchAscii = self::toAscii($search);
        $this->searchLength = mb_strlen($search);
        $this->findPositions();
    }

    # removes diacritic marks from $str and replaces every blank character
    # with a space; the number of characters is preserved
    private static function toAscii($str) {
        return strtr($str, self::$transformation);
    }

    # sets positions of non-overlapping occurrences of searchAscii inside textAscii
    private function findPositions() {
        $this->searchPositions = [];
        if ($this->searchLength === 0) {
            # an empty needle matches at every offset without advancing it,
            # which would never terminate; treat it as "nothing found"
            return;
        }
        $pos = 0;
        # the offset guard keeps mb_stripos from being called with an offset
        # beyond the end of the haystack
        while ($pos <= $this->textLength
            && ($pos = mb_stripos($this->textAscii, $this->searchAscii, $pos)) !== false) {
            $this->searchPositions[] = $pos;
            $pos += $this->searchLength;
        }
    }

    # sets [start, end) positions of runs of spaces in textAscii;
    # it makes sure that the beginning and the end of textAscii are
    # represented too (as empty runs if there is no space there)
    private function findSpaces() {
        # find all spaces and merge adjacent ones into runs on the fly
        $runs = [];
        $pos = 0;
        while (($pos = mb_strpos($this->textAscii, ' ', $pos)) !== false) {
            $last = count($runs) - 1;
            if ($last >= 0 && $runs[$last][1] === $pos) {
                # this space directly follows the previous run: extend it
                $runs[$last][1] = $pos + 1;
            } else {
                $runs[] = [$pos, $pos + 1];
            }
            $pos++;
        }

        if (count($runs) === 0 || $runs[0][0] > 0) {
            # text does not start with a space: add the beginning
            array_unshift($runs, [0, 0]);
        }
        $last = $runs[count($runs) - 1];
        if ($last[1] < $this->textLength) {
            # text does not end with a space: add the end
            $runs[] = [$this->textLength, $this->textLength];
        }
        $this->textSpaces = $runs;
    }

    # finds the region of at most $length characters which contains the most
    # occurrences of the search; returns [start, end) of that region, or
    # [0, 0] if the search does not occur at all
    private function findBestPart($length) {
        $count = count($this->searchPositions);
        if ($count === 0) {
            # couldn't find the search anywhere;
            # the excerpt will be taken from the beginning of the text
            return [0, 0];
        }
        # sliding window over the occurrences: $from is the tail, $to is the head
        $bestFrom = 0;
        $bestTo = 0;
        $from = 0;
        for ($to = 0; $to < $count; $to++) {
            # move the tail to keep the window <= $length; the tail never
            # passes the head, so a search longer than $length still yields
            # a single-occurrence window instead of looping forever
            while ($from < $to
                && $this->searchPositions[$to] - $this->searchPositions[$from] + $this->searchLength > $length) {
                $from++;
            }
            # does the window contain more occurrences than the best so far?
            if ($to - $from > $bestTo - $bestFrom) {
                $bestFrom = $from;
                $bestTo = $to;
            }
        }
        return [
            $this->searchPositions[$bestFrom],
            $this->searchPositions[$bestTo] + $this->searchLength
        ];
    }

    # extends the region [$start, $end) to whole words, then keeps adding one
    # word on each side until the region reaches $length or runs out of text;
    # returns [start, end) which may be slightly bigger than $length
    private function extendExcerpt($start, $end, $length) {
        $lastIndex = count($this->textSpaces) - 1;

        # first we find the surrounding spaces for the given region
        $startIndex = 0;
        $endIndex = null;
        foreach ($this->textSpaces as $index => $space) {
            if ($space[1] <= $start) {
                $startIndex = $index;
            }
            # strict comparison is essential: index 0 is a valid answer
            if ($endIndex === null && $space[0] >= $end) {
                $endIndex = $index;
            }
        }
        if ($endIndex === null) {
            # the region runs into the trailing spaces of the text
            $endIndex = $lastIndex;
        }

        # until the region reaches $length or it stops changing
        $changes = 1;
        while ($this->textSpaces[$endIndex][0] - $this->textSpaces[$startIndex][1] < $length && $changes > 0) {
            $changes = 0;
            if ($startIndex > 0) {
                $startIndex--;
                $changes++;
            }
            if ($endIndex < $lastIndex) {
                $endIndex++;
                $changes++;
            }
        }

        $from = $this->textSpaces[$startIndex][1];
        $to = $this->textSpaces[$endIndex][0];
        if ($start < $end) {
            # a search which itself starts or ends with spaces may reach
            # beyond the word boundaries; never cut the matched region
            $from = min($from, $start);
            $to = max($to, $end);
        }
        if ($to < $from) {
            # text consisting of spaces only
            $to = $from;
        }
        return [$from, $to];
    }

    # finds the region of full words which contains the most occurrences of
    # the search, or the beginning of the text if none were found;
    # the region is limited by $length, which it is allowed to exceed
    private function findExcerpt($length) {
        list($start, $end) = $this->findBestPart($length);
        return $this->extendExcerpt($start, $end, $length);
    }

    # returns the excerpt of the original text
    # $length   - approximate length of the excerpt in characters; it may be
    #             exceeded because the excerpt is cut at word boundaries
    # $wrapFunc - optional callable applied to every occurrence of the search
    #             inside the excerpt (receives the occurrence as it appears
    #             in the original text, returns its replacement)
    public function extract($length = 150, $wrapFunc = NULL) {
        list($start, $end) = $this->findExcerpt($length);
        $excerpt = mb_substr($this->text, $start, $end - $start);

        # if no wrapFunc was given, just return the excerpt
        if (!is_callable($wrapFunc)) {
            return $excerpt;
        }

        # otherwise split the excerpt into parts:
        # the wrapFunc is applied to each occurrence of the search,
        # the rest is kept unchanged
        $parts = [];
        $position = $start;
        foreach ($this->searchPositions as $searchPosition) {
            $searchEnd = $searchPosition + $this->searchLength;
            if ($searchPosition < $position || $searchEnd > $end) {
                # occurrence lies outside of the excerpt
                continue;
            }
            if ($searchPosition > $position) {
                # part between occurrences or at the beginning of the excerpt
                $parts[] = mb_substr($excerpt, $position - $start, $searchPosition - $position);
            }
            # occurrence of the search, taken from the original text
            $found = mb_substr($excerpt, $searchPosition - $start, $this->searchLength);
            $parts[] = call_user_func($wrapFunc, $found);
            $position = $searchEnd;
        }
        if ($position < $end) {
            # part at the end of the excerpt
            $parts[] = mb_substr($excerpt, $position - $start, $end - $position);
        }

        # all parts are glued together
        return implode("", $parts);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.