Skip to content

Instantly share code, notes, and snippets.

@jandk
Created February 19, 2014 15:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jandk/9093978 to your computer and use it in GitHub Desktop.
Save jandk/9093978 to your computer and use it in GitHub Desktop.
<?php
// Startup database
require_once 'Vacature/Autoloader.php';
Vacature_Autoloader::getInstance();
$mssqlconfig = Vacature_Config::get("db.mssql");
$db = Zend_Db::factory($mssqlconfig);
if (isset($_SESSION['profiler']))
$db->getProfiler()->setEnabled(true);
// Set encoding
mb_internal_encoding('UTF-8');
set_time_limit(0);
if (!isset($_GET['table']) || !isset($_GET['field']))
die('No table or no field.');
$id = (isset($_GET['id'])) ? $_GET['id'] : 'ID';
$table = $_GET['table'];
$rawField = $_GET['field'];
$field = $db->quoteIdentifier($rawField);
$doRun = isset($_GET['run']) && $_GET['run'] === 'true';
// Get translations and statement
$translations = getTranslations('windows-1252');
$sql = $db->select()
->from($table, array(
'id' => $id,
'bin' => new Zend_Db_Expr('CAST(CAST(' . $field . 'AS NVARCHAR(MAX)) AS VARBINARY(MAX))')
))
->where($field . ' IS NOT NULL')
->where('LEN(' . $field . ') > 0')
->where(new Zend_Db_Expr($field . " LIKE '%['+ ".getLike()." +']%' COLLATE Latin1_General_BIN2"));
$statement = $sql->query();
if($doRun)
{
global $db, $statement, $translations;
header('Content-type: text/plain; charset=utf-8');
$changes = array();
while ($res = $statement->fetch(Zend_Db::FETCH_ASSOC))
{
$utf16 = binToUtf16($res['bin']);
$utf16 = translateUtf16($utf16, $translations, false);
$indices = scanUtf8($utf16);
if (count($indices) === 0)
continue;
while(count($indices))
{
decodeUtf8($utf16, $indices);
$indices = scanUtf8($utf16);
}
$utf16 = translateUtf16($utf16, $translations, true);
$new08 = utf16ToUtf8($utf16);
$changes[] = array((int)$res['id'], $new08);
}
echo 'Changes found: ' . count($changes) . PHP_EOL . PHP_EOL;
foreach($changes as $change)
{
printf('%10d: %s'.PHP_EOL, $change[0], $change[1]);
$db->update($table, array($rawField => $change[1]), $db->quoteInto($id . ' = ?', $change[0]));
}
}
else
{
global $statement, $translations;
header('Content-type: text/html; charset=utf-8');
$changes = array();
while ($res = $statement->fetch(Zend_Db::FETCH_ASSOC))
{
$utf16 = binToUtf16($res['bin']);
$old08 = utf16ToUtf8($utf16);
$utf16 = translateUtf16($utf16, $translations, false);
$indices = scanUtf8($utf16);
$lastIndices = $indices;
if (count($indices) === 0)
continue;
$oHtml = addHtml($old08, $indices);
while(count($indices))
{
decodeUtf8($utf16, $indices);
$lastIndices = $indices;
$indices = scanUtf8($utf16);
}
$utf16 = translateUtf16($utf16, $translations, true);
$new08 = utf16ToUtf8($utf16);
$nHtml = addHtml($new08, $lastIndices, true);
$changes[] = array((int)$res['id'], $oHtml, $nHtml);
}
buildTable(
array('ID', 'Old', 'New'),
$changes
);
echo '<br><br><a href="' . $_SERVER['REQUEST_URI'] . '&run=true">Run run, baby run run</a>';
}
/**
* Converts a string to hexadecimal representation.
*/
function strhex($string, $linesize = 16)
{
return implode(PHP_EOL, array_map(
function($str)
{
$split = str_split($str, 2);
return implode(' ', $split);
},
str_split(array_shift(unpack('H*', $string)), $linesize * 2)
));
}
/**
* Scans an array of integers for possible utf-8 sequences
*/
function scanUtf8(array $utf16)
{
$arrCount = count($utf16);
$indices = array();
for ($i = 0; $i < $arrCount - 1; $i++)
{
// 2 byte sequence (byte & 0b111xxxxx === 0b110xxxxx)
if (($utf16[ $i ] <= 0xff && ($utf16[ $i ] & 0xe0) === 0xc0) &&
($utf16[$i+1] <= 0xff && ($utf16[$i+1] & 0xc0) === 0x80)
)
{
$char =
($utf16[ $i ] & 0x1f) << 6 |
($utf16[$i+1] & 0x3f);
// The chars have to fit in CP1252
if ($char <= 0xff)
$indices[] = array($i, 2);
}
// 3 byte sequence (byte & 0b1111xxxx === 0b1110xxxx)
if ($i < $arrCount - 2 &&
($utf16[ $i ] <= 0xff && ($utf16[ $i ] & 0xf0) === 0xe0) &&
($utf16[$i+1] <= 0xff && ($utf16[$i+1] & 0xc0) === 0x80) &&
($utf16[$i+2] <= 0xff && ($utf16[$i+2] & 0xc0) === 0x80)
)
{
$char =
($utf16[ $i ] & 0x0f) << 12 |
($utf16[$i+1] & 0x3f) << 6 |
($utf16[$i+2] & 0x3f);
// The chars have to fit in CP1252
if ($char <= 0xff)
$indices[] = array($i, 3);
}
// We could do 4, 5 and 6 byte sequences, but they overflow UCS-2
}
return $indices;
}
function decodeUtf8(array &$utf16, array &$indices)
{
$numIndices = count($indices);
for ($i = 0; $i < $numIndices; $i++)
{
$index = $indices[$i][0];
$size = 0;
if (($utf16[$index] & 0xe0) === 0xc0)
{
$size = 1;
$char =
($utf16[$index + 0] & 0x1f) << 6 |
($utf16[$index + 1] & 0x3f);
}
if (($utf16[$index] & 0xf0) === 0xe0)
{
$size = 2;
$char =
($utf16[$index + 0] & 0x0f) << 12 |
($utf16[$index + 1] & 0x3f) << 6 |
($utf16[$index + 2] & 0x3f);
}
if ($size === 0)
continue;
// Remove the old
array_splice($utf16, $index, $size + 1, $char);
// Update offsets
for ($j = $i + 1; $j < $numIndices; $j++)
{
$indices[$j][0] -= $size;
$indices[$j][1] = 1;
}
}
}
function binToUtf16($data)
{
return array_values(unpack('S*', $data));
}
function translateUtf16(array $utf16, array $translations = array(), $reverse = false)
{
if($reverse)
$translations = array_flip($translations);
// Do translations from foreign charset
$utf16Count = count($utf16);
for ($i = 0; $i < $utf16Count; $i++)
if (array_key_exists($utf16[$i], $translations))
$utf16[$i] = $translations[$utf16[$i]];
return $utf16;
}
function utf16ToUtf8(array $utf16)
{
$utf16str = implode(array_map(function($c)
{
return pack('S', $c);
}, $utf16));
return iconv('UTF-16LE', 'UTF-8', $utf16str);
}
/**
* Gets the LIKE query for all probable utf-8 characters, with the specified charset
*/
function getLike($charset = 'windows-1252')
{
$likes = array();
for ($i = 128; $i < 256; $i++)
{
if (($i & 0xe0) === 0xc0 || ($i & 0xf0) === 0xe0)
{
$c = @array_shift(unpack('S', iconv($charset, 'UTF-16LE//IGNORE', chr($i))));
if ($c !== null)
$likes[] = "NCHAR($c)";
}
}
return implode(' + ', $likes);
}
function getTranslations($charset = 'windows-1252')
{
$translations = array();
for ($i = 128; $i < 256; $i++)
{
// TODO: Is this check needed?
// if (($i & 0xe0) === 0xc0 || ($i & 0xf0) === 0xe0)
{
$c = @array_shift(unpack('S', iconv($charset, 'UTF-16LE//IGNORE', chr($i))));
if ($c !== null && $c !== $i)
$translations[$c] = $i;
}
}
return $translations;
}
function addHtml($string, array $indices, $use1 = false)
{
$prefix = '<span style="font-weight: bold; color: red;">';
$suffix = '</span>';
$addedSize = mb_strlen($prefix) + mb_strlen($suffix);
$numIndices = count($indices);
for ($i = 0; $i < $numIndices; $i++)
{
$index = $indices[$i][0];
$size = $use1 ? 1 : $indices[$i][1];
$p1 = mb_substr($string, 0, $index);
$p2 = mb_substr($string, $index, $size);
$p3 = mb_substr($string, $index + $size);
$string = $p1 . $prefix . $p2 . $suffix . $p3;
for ($j = $i + 1; $j < $numIndices; $j++)
$indices[$j][0] += $addedSize;
}
return $string;
}
function buildTable(array $titles, array $table)
{
?>
<html>
<head>
<style type="text/css">
* {
font-family: Georgia, SERIF
}
table {
border-collapse: collapse;
}
td {
border: 1px solid gray;
padding: 5px;
}
thead td {
text-align: center;
font-weight: bold;
}
</style>
</head>
<body>
<?php
echo '<table>';
echo '<thead><tr>';
foreach ($titles as $title)
echo '<td>' . $title . '</td>';
echo '</tr></thead>';
echo '<tbody>';
foreach ($table as $row)
{
echo '<tr>';
foreach ($row as $field)
echo '<td>' . $field . '</td>' . PHP_EOL;
echo '</tr>';
}
echo '</tbody>';
echo '</table>';
?>
</body>
</html>
<?php
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment