Created
October 10, 2010 10:28
-
-
Save kanemu/619125 to your computer and use it in GitHub Desktop.
[groovy]文字コード判定。icu4j使用。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// g100pon #32: character-encoding detection using ICU4J's CharsetDetector.
@Grab(group='com.ibm.icu', module='icu4j', version='4.0.1')
import com.ibm.icu.text.CharsetDetector

def detector = new CharsetDetector()

// The sample files below must be prepared in advance.
// Each entry maps a sample file to the charset name the detector is expected
// to report for it; the map literal keeps insertion order, so the files are
// checked in the order listed.
def expectedCharsets = [
    '/works/test/iso2022-jp.txt': 'ISO-2022-JP',
    '/works/test/utf8.txt'      : 'UTF-8',
    '/works/test/eucjp.txt'     : 'EUC-JP',
    '/works/test/sjis.txt'      : 'Shift_JIS',
    // Note: CP932 ends up being recognised as Shift_JIS, even when the file
    // contains platform-dependent (vendor-specific) characters.
    '/works/test/cp932.txt'     : 'Shift_JIS'
]

expectedCharsets.each { path, expected ->
    def bytes = new File(path).getBytes()
    // detect() may return null when no charset matches; safe navigation turns
    // that case into a failed assertion rather than a NullPointerException.
    def detected = detector.setText(bytes).detect()?.getName()
    assert detected == expected : "${path}: expected ${expected} but detected ${detected}"
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment