Created
February 19, 2014 15:10
-
-
Save jandk/9093978 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Startup database | |
require_once 'Vacature/Autoloader.php'; | |
Vacature_Autoloader::getInstance(); | |
$mssqlconfig = Vacature_Config::get("db.mssql"); | |
$db = Zend_Db::factory($mssqlconfig); | |
if (isset($_SESSION['profiler'])) | |
$db->getProfiler()->setEnabled(true); | |
// Set encoding | |
mb_internal_encoding('UTF-8'); | |
set_time_limit(0); | |
if (!isset($_GET['table']) || !isset($_GET['field'])) | |
die('No table or no field.'); | |
$id = (isset($_GET['id'])) ? $_GET['id'] : 'ID'; | |
$table = $_GET['table']; | |
$rawField = $_GET['field']; | |
$field = $db->quoteIdentifier($rawField); | |
$doRun = isset($_GET['run']) && $_GET['run'] === 'true'; | |
// Get translations and statement | |
$translations = getTranslations('windows-1252'); | |
$sql = $db->select() | |
->from($table, array( | |
'id' => $id, | |
'bin' => new Zend_Db_Expr('CAST(CAST(' . $field . 'AS NVARCHAR(MAX)) AS VARBINARY(MAX))') | |
)) | |
->where($field . ' IS NOT NULL') | |
->where('LEN(' . $field . ') > 0') | |
->where(new Zend_Db_Expr($field . " LIKE '%['+ ".getLike()." +']%' COLLATE Latin1_General_BIN2")); | |
$statement = $sql->query(); | |
if($doRun) | |
{ | |
global $db, $statement, $translations; | |
header('Content-type: text/plain; charset=utf-8'); | |
$changes = array(); | |
while ($res = $statement->fetch(Zend_Db::FETCH_ASSOC)) | |
{ | |
$utf16 = binToUtf16($res['bin']); | |
$utf16 = translateUtf16($utf16, $translations, false); | |
$indices = scanUtf8($utf16); | |
if (count($indices) === 0) | |
continue; | |
while(count($indices)) | |
{ | |
decodeUtf8($utf16, $indices); | |
$indices = scanUtf8($utf16); | |
} | |
$utf16 = translateUtf16($utf16, $translations, true); | |
$new08 = utf16ToUtf8($utf16); | |
$changes[] = array((int)$res['id'], $new08); | |
} | |
echo 'Changes found: ' . count($changes) . PHP_EOL . PHP_EOL; | |
foreach($changes as $change) | |
{ | |
printf('%10d: %s'.PHP_EOL, $change[0], $change[1]); | |
$db->update($table, array($rawField => $change[1]), $db->quoteInto($id . ' = ?', $change[0])); | |
} | |
} | |
else | |
{ | |
global $statement, $translations; | |
header('Content-type: text/html; charset=utf-8'); | |
$changes = array(); | |
while ($res = $statement->fetch(Zend_Db::FETCH_ASSOC)) | |
{ | |
$utf16 = binToUtf16($res['bin']); | |
$old08 = utf16ToUtf8($utf16); | |
$utf16 = translateUtf16($utf16, $translations, false); | |
$indices = scanUtf8($utf16); | |
$lastIndices = $indices; | |
if (count($indices) === 0) | |
continue; | |
$oHtml = addHtml($old08, $indices); | |
while(count($indices)) | |
{ | |
decodeUtf8($utf16, $indices); | |
$lastIndices = $indices; | |
$indices = scanUtf8($utf16); | |
} | |
$utf16 = translateUtf16($utf16, $translations, true); | |
$new08 = utf16ToUtf8($utf16); | |
$nHtml = addHtml($new08, $lastIndices, true); | |
$changes[] = array((int)$res['id'], $oHtml, $nHtml); | |
} | |
buildTable( | |
array('ID', 'Old', 'New'), | |
$changes | |
); | |
echo '<br><br><a href="' . $_SERVER['REQUEST_URI'] . '&run=true">Run run, baby run run</a>'; | |
} | |
/** | |
* Converts a string to hexadecimal representation. | |
*/ | |
function strhex($string, $linesize = 16) | |
{ | |
return implode(PHP_EOL, array_map( | |
function($str) | |
{ | |
$split = str_split($str, 2); | |
return implode(' ', $split); | |
}, | |
str_split(array_shift(unpack('H*', $string)), $linesize * 2) | |
)); | |
} | |
/** | |
* Scans an array of integers for possible utf-8 sequences | |
*/ | |
function scanUtf8(array $utf16) | |
{ | |
$arrCount = count($utf16); | |
$indices = array(); | |
for ($i = 0; $i < $arrCount - 1; $i++) | |
{ | |
// 2 byte sequence (byte & 0b111xxxxx === 0b110xxxxx) | |
if (($utf16[ $i ] <= 0xff && ($utf16[ $i ] & 0xe0) === 0xc0) && | |
($utf16[$i+1] <= 0xff && ($utf16[$i+1] & 0xc0) === 0x80) | |
) | |
{ | |
$char = | |
($utf16[ $i ] & 0x1f) << 6 | | |
($utf16[$i+1] & 0x3f); | |
// The chars have to fit in CP1252 | |
if ($char <= 0xff) | |
$indices[] = array($i, 2); | |
} | |
// 3 byte sequence (byte & 0b1111xxxx === 0b1110xxxx) | |
if ($i < $arrCount - 2 && | |
($utf16[ $i ] <= 0xff && ($utf16[ $i ] & 0xf0) === 0xe0) && | |
($utf16[$i+1] <= 0xff && ($utf16[$i+1] & 0xc0) === 0x80) && | |
($utf16[$i+2] <= 0xff && ($utf16[$i+2] & 0xc0) === 0x80) | |
) | |
{ | |
$char = | |
($utf16[ $i ] & 0x0f) << 12 | | |
($utf16[$i+1] & 0x3f) << 6 | | |
($utf16[$i+2] & 0x3f); | |
// The chars have to fit in CP1252 | |
if ($char <= 0xff) | |
$indices[] = array($i, 3); | |
} | |
// We could do 4, 5 and 6 byte sequences, but they overflow UCS-2 | |
} | |
return $indices; | |
} | |
function decodeUtf8(array &$utf16, array &$indices) | |
{ | |
$numIndices = count($indices); | |
for ($i = 0; $i < $numIndices; $i++) | |
{ | |
$index = $indices[$i][0]; | |
$size = 0; | |
if (($utf16[$index] & 0xe0) === 0xc0) | |
{ | |
$size = 1; | |
$char = | |
($utf16[$index + 0] & 0x1f) << 6 | | |
($utf16[$index + 1] & 0x3f); | |
} | |
if (($utf16[$index] & 0xf0) === 0xe0) | |
{ | |
$size = 2; | |
$char = | |
($utf16[$index + 0] & 0x0f) << 12 | | |
($utf16[$index + 1] & 0x3f) << 6 | | |
($utf16[$index + 2] & 0x3f); | |
} | |
if ($size === 0) | |
continue; | |
// Remove the old | |
array_splice($utf16, $index, $size + 1, $char); | |
// Update offsets | |
for ($j = $i + 1; $j < $numIndices; $j++) | |
{ | |
$indices[$j][0] -= $size; | |
$indices[$j][1] = 1; | |
} | |
} | |
} | |
function binToUtf16($data) | |
{ | |
return array_values(unpack('S*', $data)); | |
} | |
function translateUtf16(array $utf16, array $translations = array(), $reverse = false) | |
{ | |
if($reverse) | |
$translations = array_flip($translations); | |
// Do translations from foreign charset | |
$utf16Count = count($utf16); | |
for ($i = 0; $i < $utf16Count; $i++) | |
if (array_key_exists($utf16[$i], $translations)) | |
$utf16[$i] = $translations[$utf16[$i]]; | |
return $utf16; | |
} | |
function utf16ToUtf8(array $utf16) | |
{ | |
$utf16str = implode(array_map(function($c) | |
{ | |
return pack('S', $c); | |
}, $utf16)); | |
return iconv('UTF-16LE', 'UTF-8', $utf16str); | |
} | |
/** | |
* Gets the LIKE query for all probable utf-8 characters, with the specified charset | |
*/ | |
function getLike($charset = 'windows-1252') | |
{ | |
$likes = array(); | |
for ($i = 128; $i < 256; $i++) | |
{ | |
if (($i & 0xe0) === 0xc0 || ($i & 0xf0) === 0xe0) | |
{ | |
$c = @array_shift(unpack('S', iconv($charset, 'UTF-16LE//IGNORE', chr($i)))); | |
if ($c !== null) | |
$likes[] = "NCHAR($c)"; | |
} | |
} | |
return implode(' + ', $likes); | |
} | |
function getTranslations($charset = 'windows-1252') | |
{ | |
$translations = array(); | |
for ($i = 128; $i < 256; $i++) | |
{ | |
// TODO: Is this check needed? | |
// if (($i & 0xe0) === 0xc0 || ($i & 0xf0) === 0xe0) | |
{ | |
$c = @array_shift(unpack('S', iconv($charset, 'UTF-16LE//IGNORE', chr($i)))); | |
if ($c !== null && $c !== $i) | |
$translations[$c] = $i; | |
} | |
} | |
return $translations; | |
} | |
function addHtml($string, array $indices, $use1 = false) | |
{ | |
$prefix = '<span style="font-weight: bold; color: red;">'; | |
$suffix = '</span>'; | |
$addedSize = mb_strlen($prefix) + mb_strlen($suffix); | |
$numIndices = count($indices); | |
for ($i = 0; $i < $numIndices; $i++) | |
{ | |
$index = $indices[$i][0]; | |
$size = $use1 ? 1 : $indices[$i][1]; | |
$p1 = mb_substr($string, 0, $index); | |
$p2 = mb_substr($string, $index, $size); | |
$p3 = mb_substr($string, $index + $size); | |
$string = $p1 . $prefix . $p2 . $suffix . $p3; | |
for ($j = $i + 1; $j < $numIndices; $j++) | |
$indices[$j][0] += $addedSize; | |
} | |
return $string; | |
} | |
function buildTable(array $titles, array $table) | |
{ | |
?> | |
<html> | |
<head> | |
<style type="text/css"> | |
* { | |
font-family: Georgia, SERIF | |
} | |
table { | |
border-collapse: collapse; | |
} | |
td { | |
border: 1px solid gray; | |
padding: 5px; | |
} | |
thead td { | |
text-align: center; | |
font-weight: bold; | |
} | |
</style> | |
</head> | |
<body> | |
<?php | |
echo '<table>'; | |
echo '<thead><tr>'; | |
foreach ($titles as $title) | |
echo '<td>' . $title . '</td>'; | |
echo '</tr></thead>'; | |
echo '<tbody>'; | |
foreach ($table as $row) | |
{ | |
echo '<tr>'; | |
foreach ($row as $field) | |
echo '<td>' . $field . '</td>' . PHP_EOL; | |
echo '</tr>'; | |
} | |
echo '</tbody>'; | |
echo '</table>'; | |
?> | |
</body> | |
</html> | |
<?php | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment