Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
[groovy]文字コード判定。icu4j使用。
// g100pon #32 文字コード判定
import com.ibm.icu.text.CharsetDetector
@Grab(group='com.ibm.icu', module='icu4j', version='4.0.1')
def detector = new CharsetDetector()

// Charset detection check using ICU4J's CharsetDetector.
// The sample files are prepared in advance; each path maps to the charset
// name the detector is expected to report for that file.
def expectedCharsets = [
    '/works/test/iso2022-jp.txt': 'ISO-2022-JP',
    '/works/test/utf8.txt'      : 'UTF-8',
    '/works/test/eucjp.txt'     : 'EUC-JP',
    '/works/test/sjis.txt'      : 'Shift_JIS',
    // Note: CP932 ends up being recognized as Shift_JIS, even when the file
    // contains platform-dependent characters.
    '/works/test/cp932.txt'     : 'Shift_JIS'
]

// The map literal keeps insertion order, so files are checked in the order listed.
expectedCharsets.each { path, expected ->
    def match = detector.setText(new File(path).getBytes()).detect()
    // detect() yields null when no charset matches; safe navigation turns that
    // case into a readable assertion failure instead of a NullPointerException.
    def detected = match?.getName()
    assert detected == expected : "${path}: expected ${expected} but detected ${detected}"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment