<?php | |
/* Word-wrapping code from Simon’s Quest Multilingual Retranslation project
 * Copyright © 2020 Joel Yliluoma — https://iki.fi/bisqwit/cv2fin/
 */
/**
 * One node of the word-wrap search: the text emitted so far, how many input
 * tokens have been consumed, and running statistics used for scoring.
 *
 * Every method takes the shared, read-only `$info` array:
 *   $info[0]  list of tokens, each `[kind, text]` (kinds: see TOKEN_* below)
 *   $info[1]  the newline string appended when a line is broken
 *   $info[2]  maximum line width
 *   $info[3]  maximum number of lines
 * Lengths are measured with strlen(), i.e. in bytes.
 */
class WordWrapState
{
    // Token kinds, as stored in $info[0][i][0]
    const TOKEN_WORD     = 1;
    const TOKEN_NEWLINE  = 2;
    const TOKEN_SPACE    = 3;
    const TOKEN_PUNCT    = 4;
    const TOKEN_FULLSTOP = 5;

    // Output state: cursor column, cursor row, text produced so far
    public $x = 0;
    public $y = 0;
    public $outcome = '';
    // Input state: index of the next token to consume
    public $position = 0;
    // Statistics
    public $num_blank_lines = 0;
    public $num_lines_beginning_with_punct = 0;
    public $maximum_line_length = 0; // Updated at nl and at end
    public $last_line_with_content = 0;
    public $full_lines_with_non_punctuation = 0;
    public $num_lines_ending_with_punct = 0; // Good thing
    public $num_lines_ending_with_punct_and_word = 0; // Bad thing
    public $num_fullstop_paragraphs = 0;
    // True once Score() has folded the final line into the statistics.
    private $stats_finalized = false;

    /**
     * @param object|array|null $ref  state whose properties are copied, or null for a blank state
     * @param int               $pos  token index the new state starts at (overrides the copied one)
     */
    public function __construct($ref = null, $pos = 0)
    {
        if (isset($ref)) {
            foreach ($ref as $k => $v) {
                $this->$k = $v;
            }
        }
        $this->position = $pos;
        // A copy always counts as "not yet finalized", whatever the source said.
        $this->stats_finalized = false;
    }

    /**
     * Returns a bitmask of problems in the layout recorded so far:
     *   1 = widest line exceeds $info[2]
     *   2 = line count exceeds $info[3]
     *   4 = some line is exactly full width and ends in a word
     *   8 = some line begins with punctuation
     */
    public function Errors(&$info)
    {
        $result = 0;
        $width  = $this->maximum_line_length;
        $height = $this->last_line_with_content + 1;
        if ($width > $info[2]) {
            $result |= 1;
        }
        if ($height > $info[3]) {
            $result |= 2;
        }
        if ($this->full_lines_with_non_punctuation) {
            $result |= 4;
        }
        if ($this->num_lines_beginning_with_punct) {
            $result |= 8;
        }
        return $result;
    }

    /**
     * Returns the priority of this state; higher is better.
     *
     * On a goal state the statistics of the last (unterminated) line are
     * folded in first. That happens only once per state, so calling Score()
     * repeatedly yields the same value.
     */
    public function Score(&$info)
    {
        if (!$this->stats_finalized && $this->IsGoal($info)) {
            $this->UpdateStatsAtNl($info, $this->position - 1);
            $this->stats_finalized = true;
        }

        $errors = $this->Errors($info);
        $score = 0;
        if ($errors & 3) {
            // Width or height constraint exceeded
            $score -= 50000;
        }
        // Make sure won't generate redundant blank lines
        $score -= 4000 * ($this->num_blank_lines - $this->num_fullstop_paragraphs);
        if ($errors & 4) {
            $score -= 2500 * $this->full_lines_with_non_punctuation;
        }
        if ($errors & 8) {
            $score -= 800 * $this->num_lines_beginning_with_punct;
        }
        $score -= 100 * ($this->position - $this->num_lines_ending_with_punct);
        $score -= 90 * $this->num_lines_ending_with_punct_and_word;

        // Mean absolute length difference between consecutive non-empty
        // lines, relative to the widest line.
        $width = $this->maximum_line_length;
        $pairs = 0;
        $deviation = 0;
        $prev = 0;
        foreach (explode($info[1], $this->outcome) as $line) {
            $len = strlen($line);
            if ($len) {
                if ($prev) {
                    ++$pairs;
                    $deviation += abs($prev - $len);
                }
                $prev = $len;
            }
        }
        $variation = ($width && $pairs) ? ($deviation / $pairs) / $width : 0;

        // Height penalty; paragraph breaks are free while the text still fits.
        $h = $this->last_line_with_content + 1;
        if ($h < $info[3]) {
            $h -= $this->num_fullstop_paragraphs;
        }
        $score -= 18 * max($h, 0);
        $score -= 25 * $variation;
        return $score;
    }

    /** True when every token of $info[0] has been consumed. */
    public function IsGoal(&$info)
    {
        return $this->position >= count($info[0]);
    }

    /**
     * Consumes the token at the current position and passes every successor
     * state to $callback, one call per alternative. Does nothing on a goal
     * state or for an unknown token kind.
     *
     * @param callable $callback  receives one WordWrapState per alternative
     */
    public function Iterate(&$info, $callback)
    {
        if ($this->IsGoal($info)) {
            return; // no token left to read
        }
        $p = $this->position;
        $w = $info[0][$p][1];
        switch ($info[0][$p][0]) {
            case self::TOKEN_WORD:
                $this->ExpandWord($info, $p, $w, $callback);
                break;
            case self::TOKEN_NEWLINE:
                $this->ExpandNewline($info, $p, $w, $callback);
                break;
            case self::TOKEN_SPACE:
                $this->ExpandSpace($info, $p, $w, $callback);
                break;
            case self::TOKEN_PUNCT:
            case self::TOKEN_FULLSTOP:
                $this->ExpandPunct($info, $p, $w, $callback);
                break;
        }
    }

    /** Appends $text to the current line. */
    private function Append($text)
    {
        $this->x += strlen($text);
        $this->outcome .= $text;
    }

    /** Successor at token $p+1 whose output ends with $count inserted line breaks. */
    private function BreakLine(&$info, $p, $count)
    {
        $n = new WordWrapState($this, $p + 1);
        $n->UpdateStatsAtNl($info, $p - 1);
        $n->x = 0;
        $n->y += $count;
        $n->outcome .= str_repeat($info[1], $count);
        return $n;
    }

    /** Successors for a word token. */
    private function ExpandWord(&$info, $p, $w, $callback)
    {
        // Option 1: Append word
        $n = new WordWrapState($this, $p + 1);
        $n->Append($w);
        $callback($n);
        // Option 2: Wordwrap, but only if previous token was punctuation.
        // This is used to wrap "aaa-bbb" into "aaa-\nbbb" when no space is present
        if ($this->x > 0 && $p >= 1 && $info[0][$p - 1][0] == self::TOKEN_PUNCT) {
            $n = $this->BreakLine($info, $p, 1);
            $n->Append($w);
            $callback($n);
        }
    }

    /** Successors for an explicit newline token ($w is the token's own text). */
    private function ExpandNewline(&$info, $p, $w, $callback)
    {
        $len = strlen($w);
        // Option 1: Generate newline.
        $n = new WordWrapState($this, $p + 1);
        $n->UpdateStatsAtNl($info, $p - 1);
        if ($this->x == 0) {
            ++$n->num_blank_lines;
        }
        $n->num_blank_lines += $len - 1;
        $n->x = 0;
        $n->y += $len;
        $n->outcome .= $w;
        $callback($n);
        // Option 2: If the previous token was a full stop, generate two newlines.
        if ($p > 0 && $info[0][$p - 1][0] == self::TOKEN_FULLSTOP) {
            $n = new WordWrapState($this, $p + 1);
            $n->UpdateStatsAtNl($info, $p - 1);
            $n->num_blank_lines += $len * 2 - 1;
            $n->num_fullstop_paragraphs += $len * 2 - 1;
            $n->x = 0;
            $n->y += $len * 2;
            $n->outcome .= $w . $w;
            $callback($n);
        }
    }

    /** Successors for a run of spaces. */
    private function ExpandSpace(&$info, $p, $w, $callback)
    {
        $n = new WordWrapState($this, $p + 1);
        $next = $n->IsGoal($info) ? null : $info[0][$p + 1];
        $prev = ($p > 0) ? $info[0][$p - 1] : null;

        // Option 1: Append spaces, but only if next element is not newline,
        // and if we are not in the beginning of a line. Otherwise the spaces
        // are silently dropped.
        if ($this->x > 0 && isset($next) && $next[0] != self::TOKEN_NEWLINE) {
            $n->Append($w);
        }
        $callback($n);

        // Option 2: Replace spaces with a newline, but only if either
        // - (wishful) current line contains more than one word
        // - (mandatory) next token is not a newline and appending it would
        //   reach or exceed the line width
        $multiword = isset($prev) && $this->x > strlen($prev[1]);
        $overflow  = isset($next)
            && $next[0] != self::TOKEN_NEWLINE
            && $this->x + strlen($w) + strlen($next[1]) >= $info[2];
        if ($this->x > 0 && ($multiword || $overflow)) {
            $callback($this->BreakLine($info, $p, 1));
            // Option 3: Replace space with two newlines, if previous token was a full stop
            if (isset($prev) && $prev[0] == self::TOKEN_FULLSTOP) {
                $n = $this->BreakLine($info, $p, 2);
                ++$n->num_blank_lines;
                ++$n->num_fullstop_paragraphs;
                $callback($n);
            }
        }
    }

    /** Successors for punctuation and full-stop tokens. */
    private function ExpandPunct(&$info, $p, $w, $callback)
    {
        // Option 1: Append punctuation
        $n = new WordWrapState($this, $p + 1);
        if ($n->x == 0) {
            ++$n->num_lines_beginning_with_punct;
        }
        $n->Append($w);
        $callback($n);
        // Option 2: Wordwrap, but only if the line length would exceed limits otherwise.
        // (This forced break is not counted in num_lines_beginning_with_punct.)
        if ($this->x > 0 && $this->x + strlen($w) > $info[2]) {
            $n = $this->BreakLine($info, $p, 1);
            $n->Append($w);
            $callback($n);
        }
    }

    /**
     * Folds the line that is about to end into the statistics.
     *
     * @param int $p  index of the last token placed on that line; may be -1
     *                when nothing precedes the break
     */
    public function UpdateStatsAtNl(&$info, $p)
    {
        $this->maximum_line_length = max($this->maximum_line_length, $this->x);
        if ($this->x <= 0) {
            return; // empty line: nothing else to record
        }

        $kind = ($p >= 0) ? $info[0][$p][0] : null;
        $l    = ($p >= 0) ? strlen($info[0][$p][1]) : 0;
        $is_word = ($p >= 0 && $kind == self::TOKEN_WORD);

        $this->last_line_with_content = max($this->last_line_with_content, $this->y);

        // At full width, and the last item added was a word?
        if ($is_word && $this->x == $info[2]) {
            ++$this->full_lines_with_non_punctuation;
        }

        // Add bonus for a line that ends in punctuation; only half of it if
        // the punctuation is at begin of line or the line contains just one word
        if ($p >= 0 && ($kind == self::TOKEN_PUNCT || $kind == self::TOKEN_FULLSTOP)) {
            if ($p >= 1
                && $info[0][$p - 1][0] == self::TOKEN_WORD
                && $this->x > strlen($info[0][$p - 1][1]) + $l
            ) {
                ++$this->num_lines_ending_with_punct;
            } else {
                $this->num_lines_ending_with_punct += 0.5;
            }
        }

        // If a line ends with punctuation, space, and a word,
        // add penalty that is inversely proportional to the word's length
        // i.e. shorter words are more penalized
        if ($p >= 2
            && $is_word
            && $l > 0
            && ($info[0][$p - 2][0] == self::TOKEN_FULLSTOP || $info[0][$p - 2][0] == self::TOKEN_PUNCT)
            && $info[0][$p - 1][0] == self::TOKEN_SPACE
            && $this->x >= strlen($info[0][$p - 2][1]) + strlen($info[0][$p - 1][1]) + $l
        ) {
            $this->num_lines_ending_with_punct_and_word += 1 / $l;
        }

        // If the line ends in a short word, add some penalty
        if ($is_word && $l > 0) {
            $this->num_lines_ending_with_punct_and_word += 0.03 / ($l * $l);
        }
    }
}
/**
 * Best-first search over wrap states: repeatedly expands the highest-scoring
 * queued state until a goal state is dequeued.
 *
 * The state object must provide IsGoal($info), Iterate($info, $callback),
 * Score($info), Errors($info) and a public `outcome` property.
 *
 * @param object $firststate  initial state (queued with priority 0)
 * @param array  $info        shared search context, handed to every state method
 * @return array              [outcome text, Errors() bitmask] of the first goal dequeued
 * @throws RuntimeException   when the queue runs dry before any goal is reached
 */
function Dijkstra($firststate, &$info)
{
    $queue = new SplPriorityQueue(); // max-heap: the largest Score() comes out first
    $queue->setExtractFlags(SplPriorityQueue::EXTR_DATA);
    $queue->insert($firststate, 0);

    $enqueue = function ($newstate) use ($queue, &$info) {
        $queue->insert($newstate, $newstate->Score($info));
    };

    while (!$queue->isEmpty()) {
        $state = $queue->extract();
        if ($state->IsGoal($info)) {
            return [$state->outcome, $state->Errors($info)];
        }
        $state->Iterate($info, $enqueue);
    }

    // Only reachable when some state produced no successors, e.g. an
    // unrecognised token kind.
    throw new RuntimeException('Word wrap search ended without reaching the end of the input');
}
/**
 * Splits $subject into tokens and searches for the best word-wrapped layout.
 *
 * Token kinds: 1 = word, 2 = newline, 3 = run of indent characters,
 * 4 = run of punctuation, 5 = run of full-stop characters. Trailing space
 * tokens are dropped before the search.
 *
 * $punct, $fullstop and $indent are byte sets; $newline is matched as a
 * whole string. Any of them may be empty, in which case that token kind
 * simply never occurs.
 *
 * @param string $subject   text to wrap
 * @param string $punct     characters treated as punctuation
 * @param string $fullstop  characters treated as sentence-ending punctuation
 * @param string $newline   line-break string, in the input and in the output
 * @param string $indent    characters treated as spaces
 * @param int    $width     maximum line width
 * @param int    $height    maximum number of lines
 * @return array            whatever Dijkstra() returns: [wrapped text, error bitmask]
 * @throws RuntimeException when the tokenizer regex fails to run
 */
function CV2WordWrap($subject, $punct, $fullstop, $newline, $indent, $width, $height)
{
    // Spell every byte as \xHH so that none can act as a regex metacharacter.
    $hex = function ($bytes) {
        $out = '';
        $len = strlen($bytes);
        for ($a = 0; $a < $len; ++$a) {
            $out .= sprintf('\\x%02X', ord($bytes[$a]));
        }
        return $out;
    };
    $p = $hex($punct);
    $n = $hex($newline);
    $i = $hex($indent);
    $f = $hex($fullstop);

    // An empty set must not become "[]+": PCRE would read "]" as a class
    // member and swallow the following groups. "(?!)" never matches but
    // keeps the group numbering intact.
    $never = '((?!))';
    $excluded = $p . $i . $n . $f;
    $pattern = '/'
        . ($excluded === '' ? '([\s\S]+)' : "([^{$excluded}]+)")  // 1: word
        . '|' . ($n === '' ? $never : "({$n})")                   // 2: newline
        . '|' . ($i === '' ? $never : "([{$i}]+)")                // 3: spaces
        . '|' . ($p === '' ? $never : "([{$p}]+)")                // 4: punctuation
        . '|' . ($f === '' ? $never : "([{$f}]+)")                // 5: full stop
        . '/';

    if (preg_match_all($pattern, $subject, $mat) === false) {
        throw new RuntimeException('Tokenizer regex failed, PCRE error code ' . preg_last_error());
    }

    // The token kind is the number of the capture group that matched.
    // Every alternative is wholly one group, so a non-empty match always has one.
    $tokens = [];
    foreach ($mat[0] as $k => $v) {
        for ($kind = 1; $kind <= 5; ++$kind) {
            if (strlen($mat[$kind][$k])) {
                $tokens[] = [$kind, $v];
                break;
            }
        }
    }

    // Remove trailing spaces, if any
    while ($tokens && end($tokens)[0] == 3) {
        array_pop($tokens);
    }

    $state = new WordWrapState;
    $info = [$tokens, $newline, $width, $height, $p . $f];
    return Dijkstra($state, $info);
}
?><?php | |
//////////////////////////////////////////////
// Test code:
/**
 * Makes line breaks visible by replacing every "\n" in $s with the literal
 * marker "[nl]".
 */
function TranslateDialogTranslated($s)
{
    return str_replace("\n", '[nl]', $s);
}
//require 'inc/wordwrap.php'; | |
// Sample dialog; "[nl]" in the literal stands for a line break.
$subject = "[nl]If you plan to trek through a swamp, eat laurels. It neutralizes the poison.";
$subject = str_replace('[nl]', "\n", $subject);

// Character classes handed to the tokenizer
$punct   = "-,;";   // punctuation
$punct2  = ".:!?";  // sentence-ending punctuation
$newline = "\n";
$indent  = " ";

// Wrap with a width limit of 23 and a height limit of 6
$result = CV2WordWrap($subject, $punct, $punct2, $newline, $indent, 23, 6);

printf("Errors (bitmask): %d\n", $result[1]);
print TranslateDialogTranslated($result[0]);
print "\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment