Skip to content

Instantly share code, notes, and snippets.

@DrayChou
Created December 23, 2016 03:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DrayChou/47a73f7b7d04842ab87dd5c98ce7b3fd to your computer and use it in GitHub Desktop.
Save DrayChou/47a73f7b7d04842ab87dd5c98ce7b3fd to your computer and use it in GitHub Desktop.
<?php
// Set PHP's memory limit to 4G; presumably so that a large input file can be
// read into memory in one piece (the script loads it with file_get_contents).
ini_set('memory_limit', '4G');

// Switch mbstring's internal encoding to UTF-8 unless it already is UTF-8.
if (mb_internal_encoding() !== 'UTF-8') {
    mb_internal_encoding('UTF-8');
}

$file = 'test2.txt'; // a Chinese text file encoded as GBK
// $encodeings = mb_list_encodings();
// array_unshift($encodeings, 'GBK');
// var_dump($encodeings);
// file_put_contents("test2-code.txt", print_r($encodeings, true));
// $bef = mb_detect_encoding($str, $encodeings, true);
// $str = mb_convert_encoding($str, "UTF-8", $bef);
// file_put_contents("test2-1_{$bef}.txt", "\xEF\xBB\xBF" . $str);
// foreach ($encodeings as $v) {
// //转换为 UTF-8
// if (mb_check_encoding($str, $v)) {
// $str = mb_convert_encoding($str, "UTF-8", $v);
// file_put_contents("test2-2_{$v}.txt", "\xEF\xBB\xBF" . $str);
// }
// }
// $encodeings = [
// 'UTF-32',
// 'UTF-32BE',
// 'UTF-32LE',
// 'UTF-16',
// 'UTF-16BE',
// 'UTF-16LE',
// 'UTF-8',
// 'UTF-7',
// 'UTF7-IMAP',
// 'ASCII',
// 'EUC-JP',
// 'SJIS',
// 'eucJP-win',
// 'EUC-JP-2004',
// 'CP932',
// 'CP51932',
// 'JIS',
// 'ISO-2022-JP',
// 'ISO-2022-JP-MS',
// 'GB18030',
// 'Windows-1252',
// 'Windows-1254',
// 'ISO-8859-1',
// 'ISO-8859-2',
// 'ISO-8859-3',
// 'ISO-8859-4',
// 'ISO-8859-5',
// 'ISO-8859-6',
// 'ISO-8859-7',
// 'ISO-8859-8',
// 'ISO-8859-9',
// 'ISO-8859-10',
// 'ISO-8859-13',
// 'ISO-8859-14',
// 'ISO-8859-15',
// 'ISO-8859-16',
// 'EUC-CN',
// 'CP936',
// 'HZ',
// 'EUC-TW',
// 'BIG-5',
// 'CP950',
// 'EUC-KR',
// 'UHC',
// 'ISO-2022-KR',
// 'Windows-1251',
// 'CP866',
// 'KOI8-R',
// 'KOI8-U',
// 'ArmSCII-8',
// 'CP850',
// 'JIS-ms',
// 'ISO-2022-JP-2004',
// 'CP50220',
// 'CP50220raw',
// 'CP50221',
// 'CP50222',
// ];
// $str = file_get_contents($file);
// $bef = mb_detect_encoding($str, $encodeings, true);
// $str = mb_convert_encoding($str, "UTF-8", $bef);
// file_put_contents("test2-3_{$bef}.txt", $str);
// foreach ($encodeings as $v) {
// //转换为 UTF-8
// $str = file_get_contents($file);
// if (mb_check_encoding($str, $v)) {
// $str = mb_convert_encoding($str, "UTF-8", $v);
// file_put_contents("test2-4_{$v}.txt", $str);
// }
// }
// Name of the input file that the code below inspects and converts.
$file = 'test2.txt';
/**
 * List the mbstring encoding names under which the given file's bytes pass
 * an iconv round trip.
 *
 * Every name reported by mb_list_encodings(), except a fixed list of skipped
 * names, is probed by converting the file content from that encoding to
 * itself with iconv(). A name is kept only when the conversion succeeds and
 * yields exactly the original bytes.
 *
 * @param string $file Path of the file to inspect.
 *
 * @return string[] Encoding names that passed the check, in the order
 *                  mb_list_encodings() reports them.
 *
 * @throws RuntimeException If the file content cannot be read.
 */
function list_encodings($file)
{
    // Names that are never probed.
    $skipped = ['pass', 'auto', 'wchar', 'byte2be', 'byte2le', '8bit'];

    $str = file_get_contents($file);
    if ($str === false) {
        // A failed read must not be treated as empty content; otherwise the
        // probes below would run against "" and report meaningless matches.
        throw new RuntimeException("Unable to read file: {$file}");
    }

    $res = [];
    foreach (mb_list_encodings() as $item) {
        if (in_array($item, $skipped, true)) {
            continue;
        }

        // The probe is expected to fail for many names (encoding unknown to
        // iconv, or byte sequences illegal in it), so warnings are silenced
        // on purpose and failure is detected through the return value.
        $converted = @iconv($item, $item, $str);

        // iconv() returns false on failure. Comparing the strings directly
        // rejects that case even for an empty file (md5(false) equals
        // md5(''), so comparing hashes would accept every failed probe) and
        // avoids hashing the whole content on every iteration.
        if ($converted !== $str) {
            continue;
        }

        $res[] = $item;
    }

    return $res;
}
// Collect the candidate encodings for the input file and dump them for inspection.
$encodeings = list_encodings($file);
var_dump($encodeings);

$str = file_get_contents($file);
if ($str === false) {
    throw new RuntimeException("Unable to read file: {$file}");
}

// mb_detect_encoding() needs at least one candidate (it raises an error on
// PHP 8 when the list is empty), so stop early with a clear message instead.
if ($encodeings === []) {
    throw new RuntimeException("No candidate encoding found for file: {$file}");
}

// Strict detection: returns the first candidate the content is valid in,
// or false when none of them matches.
$bef = mb_detect_encoding($str, $encodeings, true);
if ($bef === false) {
    // Without this guard false would be used as the source encoding and the
    // output file would be named "test2-5_.txt".
    throw new RuntimeException("Unable to detect the encoding of file: {$file}");
}

// Convert to UTF-8 and write the result; the detected source encoding is
// recorded in the output file name.
$str = mb_convert_encoding($str, "UTF-8", $bef);
file_put_contents("test2-5_{$bef}.txt", $str);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment