Skip to content

Instantly share code, notes, and snippets.

@bisqwit

bisqwit/wordwrap.php

Created Jul 15, 2020
Embed
What would you like to do?
<?php
/* Word-wrapping code from Simon’s Quest Multilingual Retranslation project
* Copyright © 2020 Joel Yliluoma — https://iki.fi/bisqwit/cv2fin/
*/
/**
 * One node of the word-wrap search: the text emitted so far, the index of
 * the next input token, and the statistics used to rank this partial layout.
 *
 * Every method receives the shared $info array by reference:
 *   $info[0]  list of tokens, each Array(type, text), where type is
 *             1 = word, 2 = newline, 3 = space(s), 4 = punctuation, 5 = full stop
 *   $info[1]  the newline string
 *   $info[2]  maximum line width
 *   $info[3]  maximum number of lines
 *
 * All lengths are measured with strlen(), i.e. in bytes.
 */
class WordWrapState
{
    // Output state: current column, current line, and the text produced so far
    public $x = 0, $y = 0, $outcome = '';
    // Input state: index into $info[0] of the next token to consume
    public $position = 0;
    // Statistics
    public $num_blank_lines = 0;
    public $num_lines_beginning_with_punct = 0;
    public $maximum_line_length = 0; // Updated at nl and at end
    public $last_line_with_content = 0;
    public $full_lines_with_non_punctuation = 0;
    public $num_lines_ending_with_punct = 0; // Good thing
    public $num_lines_ending_with_punct_and_word = 0; // Bad thing
    public $num_fullstop_paragraphs = 0;
    // True once the final line's statistics have been folded in (see FinalizeAtGoal).
    // It is copied together with the statistics, so a copy never folds them in twice.
    private $finalized = false;

    /**
     * @param WordWrapState|null $ref  State to copy every property from, if given
     * @param int                $pos  Token index the new state starts at
     */
    public function __construct($ref = null, $pos = 0)
    {
        if(isset($ref))
            foreach($ref as $k=>$v)
                $this->$k = $v;
        $this->position = $pos;
    }

    /**
     * Returns a bitmask of problems in this layout:
     *   1 = width constraint exceeded
     *   2 = height constraint exceeded
     *   4 = some full-width line ends in a word
     *   8 = some line begins with punctuation
     *
     * For a goal state the last line is accounted for first, so the result
     * does not depend on whether Score() has been called before.
     */
    public function Errors(&$info)
    {
        $this->FinalizeAtGoal($info);

        $result = 0;
        $width  = $this->maximum_line_length;
        $height = $this->last_line_with_content + 1;
        if($width  > $info[2]) $result |= 1;
        if($height > $info[3]) $result |= 2;
        if($this->full_lines_with_non_punctuation) $result |= 4;
        if($this->num_lines_beginning_with_punct)  $result |= 8;
        return $result;
    }

    /**
     * Returns the priority of this state; higher (less negative) is better.
     *
     * Causes of penalty:
     *   width/height constraint exceeded, redundant blank lines,
     *   full-width lines ending in a word, lines beginning with punctuation,
     *   tokens consumed (offset by lines that end in punctuation),
     *   lines ending in "punctuation, space, short word",
     *   number of lines, and variation in line lengths.
     *
     * Calling this repeatedly on the same state yields the same value.
     */
    public function Score(&$info)
    {
        $score = 0;
        // A goal state has no trailing newline token, so its last line
        // must be counted explicitly (once).
        $this->FinalizeAtGoal($info);

        $errors = $this->Errors($info);
        if($errors & 3)
        {
            // Constraints exceeded
            $score -= 50000;
        }
        // Make sure won't generate redundant blank lines
        $score -= 4000 * ($this->num_blank_lines - $this->num_fullstop_paragraphs);
        if($errors & 4)
        {
            $score -= 2500 * $this->full_lines_with_non_punctuation;
        }
        if($errors & 8)
        {
            $score -= 800 * $this->num_lines_beginning_with_punct;
        }
        $score -= 100 * ($this->position - $this->num_lines_ending_with_punct);
        $score -= 90 * $this->num_lines_ending_with_punct_and_word;

        // Average length difference between consecutive non-empty lines,
        // relative to the longest line
        $width = $this->maximum_line_length;
        $nlines = 0;
        $prev = 0;
        $deviation = 0;
        foreach(explode($info[1], $this->outcome) as $line)
        {
            $l = strlen($line);
            if($l)
            {
                if($prev) { ++$nlines; $deviation += abs($prev - $l); }
                $prev = $l;
            }
        }
        $variation = ($width && $nlines) ? ($deviation / $nlines) / $width : 0;

        // Paragraph breaks after full stops are not charged as extra height
        // as long as the text still fits
        $h = $this->last_line_with_content + 1;
        if($h < $info[3]) $h -= $this->num_fullstop_paragraphs;
        $score -= 18 * max($h, 0);
        $score -= 25 * $variation;
        return $score;
    }

    /** True when every token of $info[0] has been consumed. */
    public function IsGoal(&$info)
    {
        return $this->position >= count($info[0]);
    }

    /**
     * Generates every successor of this state for the token at $this->position
     * and passes each one to $callback. Must not be called on a goal state.
     * A token whose type is not 1..5 produces no successors.
     */
    public function Iterate(&$info, $callback)
    {
        $p = $this->position;
        $w = $info[0][$p][1];
        switch($info[0][$p][0])
        {
            // word
            case 1:
            {
                // Option 1: Append word
                $n = new WordWrapState($this, $p+1);
                $n->x += strlen($w);
                $n->outcome .= $w;
                $callback($n);
                // Option 2: Wordwrap, but only if previous token was punctuation
                // This is used to wrap "aaa-bbb" into "aaa-\nbbb" when no space is present
                if($this->x > 0 && $p >= 1 && $info[0][$p-1][0] == 4)
                {
                    $n = new WordWrapState($this, $p+1);
                    $n->UpdateStatsAtNl($info, $p-1);
                    $n->x = 0;
                    ++$n->y;
                    $n->outcome .= $info[1]; // newline
                    $n->x += strlen($w);
                    $n->outcome .= $w;
                    $callback($n);
                }
                break;
            }
            // newline
            case 2:
            {
                // Number of line breaks in this token (not its byte length,
                // which would over-count a multi-byte newline string)
                $breaks = $this->CountNewlines($w, $info);
                // Option 1: Generate newline.
                $n = new WordWrapState($this, $p+1);
                $n->UpdateStatsAtNl($info, $p-1);
                if($this->x == 0)
                {
                    ++$n->num_blank_lines;
                }
                $n->num_blank_lines += $breaks-1;
                $n->x = 0;
                $n->y += $breaks;
                $n->outcome .= $w;
                $callback($n);
                // Option 2: If the previous token was a full stop, generate two newlines.
                if($p > 0 && $info[0][$p-1][0] == 5)
                {
                    $n = new WordWrapState($this, $p+1);
                    $n->UpdateStatsAtNl($info, $p-1);
                    $n->num_blank_lines += $breaks*2-1;
                    $n->num_fullstop_paragraphs += $breaks*2-1;
                    $n->x = 0;
                    $n->y += $breaks*2;
                    $n->outcome .= $w.$w;
                    $callback($n);
                }
                break;
            }
            // space
            case 3:
            {
                // Option 1: Append spaces, but only if next element
                // is not newline, and if we are not in the beginning of a line.
                // Otherwise the spaces are silently dropped.
                $n = new WordWrapState($this, $p+1);
                $next = null; if(!$n->IsGoal($info)) $next = $info[0][$p+1];
                $prev = null; if($p > 0) $prev = $info[0][$p-1];
                if($this->x > 0 && isset($next) && $next[0] != 2) // next one is not a newline
                {
                    $n->x += strlen($w);
                    $n->outcome .= $w;
                }
                $callback($n);
                // Option 2: Replace spaces with a newline, but only if either
                // - (wishful) current line contains more than one word
                // - (mandatory) next line is word and line length would be exceeded otherwise
                if($this->x > 0
                && (
                     (isset($prev) && $this->x > strlen($prev[1]))
                  || (isset($next) && $next[0] != 2 && $this->x + strlen($w) + strlen($next[1]) >= $info[2])
                  ))
                {
                    $n = new WordWrapState($this, $p+1);
                    $n->UpdateStatsAtNl($info, $p-1);
                    $n->x = 0;
                    ++$n->y;
                    $n->outcome .= $info[1]; // newline
                    $callback($n);
                    // Option 3: Replace space with two newlines, if previous token was a full stop
                    if(isset($prev) && $prev[0] == 5)
                    {
                        $n = new WordWrapState($this, $p+1);
                        $n->UpdateStatsAtNl($info, $p-1);
                        ++$n->num_blank_lines;
                        ++$n->num_fullstop_paragraphs;
                        $n->x = 0;
                        $n->y += 2;
                        $n->outcome .= $info[1] . $info[1]; // two newlines
                        $callback($n);
                    }
                }
                break;
            }
            // punct
            case 4:
            case 5:
            {
                // Option 1: Append punctuation
                $n = new WordWrapState($this, $p+1);
                if($n->x == 0)
                {
                    ++$n->num_lines_beginning_with_punct;
                }
                $n->x += strlen($w);
                $n->outcome .= $w;
                $callback($n);
                // Option 2: Wordwrap, but only if the line length would exceed limits otherwise
                if($this->x > 0 && $this->x + strlen($w) > $info[2])
                {
                    $n = new WordWrapState($this, $p+1);
                    $n->UpdateStatsAtNl($info, $p-1);
                    $n->x = 0;
                    ++$n->y;
                    $n->outcome .= $info[1]; // newline
                    $n->x += strlen($w);
                    $n->outcome .= $w;
                    $callback($n);
                }
                break;
            }
        }
    }

    /**
     * Folds the line that is being closed into the statistics.
     *
     * @param int $p  Index of the last token placed on that line,
     *                or a negative value if there is none
     */
    public function UpdateStatsAtNl(&$info, $p)
    {
        $this->maximum_line_length = max($this->maximum_line_length, $this->x);
        $l = 0;
        if($p >= 0) $l = strlen($info[0][$p][1]);
        if($this->x > 0)
        {
            $this->last_line_with_content = max($this->last_line_with_content, $this->y);
            // At full width, and the last item added was a word?
            if($p >= 0 && $this->x == $info[2] && $info[0][$p][0] == 1)
            {
                ++$this->full_lines_with_non_punctuation;
            }
            // Add bonus for a line that ends in punctuation; only half of it
            // if the punctuation is at the beginning of the line or the line
            // contains just one word
            if($p >= 0 && ($info[0][$p][0] == 4 || $info[0][$p][0] == 5))
            {
                if($p >= 1 && $info[0][$p-1][0] == 1 && $this->x > strlen($info[0][$p-1][1]) + $l)
                    ++$this->num_lines_ending_with_punct;
                else
                    $this->num_lines_ending_with_punct += 0.5;
            }
            // If a line ends with punctuation, space, and a word,
            // add penalty that is inversely proportional to the word's length
            // i.e. shorter words are more penalized
            if($p >= 2
            && ($info[0][$p-2][0] == 5   // punct
             || $info[0][$p-2][0] == 4   // punct
               )
            && $info[0][$p-1][0] == 3    // space
            && $info[0][$p][0] == 1      // word
            && $this->x >= strlen($info[0][$p-2][1])
                         + strlen($info[0][$p-1][1])
                         + $l)
            {
                $penalty = 1 / $l;
                $this->num_lines_ending_with_punct_and_word += $penalty;
            }
            // If the line ends in a short word, add some penalty
            if($p >= 0
            && $info[0][$p][0] == 1 // word
              )
            {
                $penalty = 0.03 / ($l*$l);
                $this->num_lines_ending_with_punct_and_word += $penalty;
            }
        }
    }

    /**
     * For a goal state, accounts for the last line exactly once.
     * Does nothing for non-goal states or when already done.
     */
    private function FinalizeAtGoal(&$info)
    {
        if($this->finalized || !$this->IsGoal($info)) return;
        $this->UpdateStatsAtNl($info, $this->position - 1);
        $this->finalized = true;
    }

    /**
     * Number of line breaks contained in newline token text $w.
     * Falls back to the byte length of $w when the newline string
     * is empty or does not occur in $w.
     */
    private function CountNewlines($w, &$info)
    {
        $newline = $info[1];
        $count = strlen($newline) ? substr_count($w, $newline) : 0;
        return $count > 0 ? $count : strlen($w);
    }
};
/**
 * Best-first search over wrap states: repeatedly takes the queued state
 * with the highest Score() and expands it, and stops at the first goal
 * state that is extracted from the queue.
 *
 * @param object $firststate  Start state; must provide IsGoal(), Iterate(),
 *                            Score(), Errors() and an $outcome property
 * @param array  $info        Shared search data, passed through to the state methods
 * @return array              Array(outcome text, Errors() bitmask) of the goal state
 * @throws RuntimeException   If the queue runs empty before any goal state is
 *                            reached (a state produced no successors)
 */
function Dijkstra($firststate, &$info)
{
    $queue = new SplPriorityQueue();
    $queue->setExtractFlags(SplPriorityQueue::EXTR_DATA);
    $queue->insert($firststate, 0);

    while($queue->valid())
    {
        $state = $queue->extract();
        if($state->IsGoal($info))
        {
            return Array($state->outcome, $state->Errors($info));
        }
        // Queue every successor, ranked by its own score
        $state->Iterate($info, function($newstate) use($queue, &$info)
        {
            $queue->insert($newstate, $newstate->Score($info));
        });
    }

    throw new RuntimeException('Dijkstra: search space exhausted without reaching a goal state');
}
/**
 * Splits $subject into tokens and searches for the best word-wrapped layout.
 *
 * Token types: 1 = word, 2 = newline, 3 = run of indent (space) characters,
 * 4 = run of punctuation, 5 = run of full-stop characters.
 * Trailing space tokens are dropped before wrapping.
 * Matching is byte-wise; each character set is a string of single bytes.
 *
 * @param string $subject   Text to wrap
 * @param string $punct     Punctuation characters (may be empty)
 * @param string $fullstop  Sentence-ending characters (may be empty)
 * @param string $newline   Newline string
 * @param string $indent    Space characters (may be empty)
 * @param int    $width     Maximum line width
 * @param int    $height    Maximum number of lines
 * @return array            Whatever Dijkstra() returns: Array(wrapped text, error bitmask)
 * @throws RuntimeException If the tokenizer regex fails
 */
function CV2WordWrap($subject, $punct,$fullstop,$newline,$indent, $width,$height)
{
    // Hex-escape every byte so that none of them is special inside the regex
    $hex = function($chars)
    {
        $out = '';
        for($a=0; $a<strlen($chars); ++$a) $out .= sprintf('\\x%02X', ord($chars[$a]));
        return $out;
    };
    $p = $hex($punct);
    $n = $hex($newline);
    $i = $hex($indent);
    $f = $hex($fullstop);

    // An empty set must not become "[]", which PCRE reads as a class that
    // begins with a literal "]" and swallows the alternatives that follow.
    // "(?!)" never matches, so the capture group numbers stay the same.
    $never  = '(?!)';
    $delims = "{$p}{$i}{$n}{$f}";
    $word   = strlen($delims) ? "[^{$delims}]+" : '[\\x00-\\xFF]+';
    $nl     = strlen($n) ? $n         : $never;
    $spaces = strlen($i) ? "[{$i}]+"  : $never;
    $puncts = strlen($p) ? "[{$p}]+"  : $never;
    $stops  = strlen($f) ? "[{$f}]+"  : $never;
    $pattern = "/({$word})|({$nl})|({$spaces})|({$puncts})|({$stops})/";

    if(preg_match_all($pattern, $subject, $mat) === false)
    {
        throw new RuntimeException('CV2WordWrap: tokenizer pattern failed');
    }

    // Classify each match by the capture group that produced it
    $result = Array();
    foreach($mat[0] as $k=>$v)
    {
        if(strlen($mat[1][$k]))     $type = 1; // word
        elseif(strlen($mat[2][$k])) $type = 2; // newline
        elseif(strlen($mat[3][$k])) $type = 3; // spaces
        elseif(strlen($mat[4][$k])) $type = 4; // punctuation
        elseif(strlen($mat[5][$k])) $type = 5; // full stop
        elseif(strlen($v))          $type = 0;
        else continue;
        $result[] = Array($type, $v);
    }

    // Remove trailing spaces, if any
    while(count($result) && $result[count($result)-1][0] == 3)
    {
        array_pop($result);
    }

    $state = new WordWrapState;
    $info = Array($result,$newline,$width,$height, "{$p}{$f}");
    return Dijkstra($state, $info);
}
?><?php
//////////////////////////////////////////////
// Test code:
/** Makes line breaks visible by rewriting every "\n" in $s as the marker "[nl]". */
function TranslateDialogTranslated($s)
{
    $lineBreak = "\n";
    $marker    = '[nl]';
    return str_replace($lineBreak, $marker, $s);
}
//require 'inc/wordwrap.php';
// Character sets handed to the wrapper
$newline = "\n";
$indent  = " ";
$punct   = "-,;";   // punctuation
$punct2  = ".:!?";  // passed as the full-stop set

// Sample dialog; the "[nl]" marker is expanded to a real line break first
$subject = "[nl]If you plan to trek through a swamp, eat laurels. It neutralizes the poison.";
$subject = str_replace('[nl]', "\n", $subject);

// Wrap to 23 columns by 6 lines, then show the error bitmask and the
// wrapped text with its line breaks made visible
$result = CV2WordWrap($subject, $punct,$punct2, $newline,$indent, 23,6);
echo sprintf("Errors (bitmask): %d\n", $result[1]);
echo TranslateDialogTranslated($result[0]), "\n";
// (GitHub page footer captured with the gist; not part of the program)
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
// You can’t perform that action at this time.