Last active
June 28, 2019 09:30
-
-
Save yourtion/a2a6d43f14f12079051b to your computer and use it in GitHub Desktop.
词库文件导出 (搜狗拼音输入法SCEL词库文件解析 、QQ拼音qpyd词库文件解析 、百度拼音输入法BCD词库文件解析 、Lingoes灵格斯电子词典LD2(LDF)文件解析)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* 百度拼音输入法BCD词库文件解析 | |
* Copyright (c) 2010 Xiaoyun Zhu | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
*/ | |
import java.io.IOException; | |
import java.io.RandomAccessFile; | |
import java.nio.ByteBuffer; | |
import java.nio.ByteOrder; | |
import java.nio.channels.FileChannel; | |
import cn.kk.kkdict.utils.Helper; | |
/**
 * Baidu Pinyin IME BDICT File Reader
 *
 * <pre>
 * BDICT Format overview:
 *
 * General Information:
 * - Chinese characters are encoded with UTF-16LE.
 * - Each pinyin syllable is stored as two one-byte table indexes (initial, final).
 * - Numbers are using little endian byte order.
 *
 * BDICT hex analysis:
 * - 0x250 total number of words
 * - 0x350 dictionary offset
 * - 0x<Offset> Dictionary
 *
 * Dictionary format:
 * - It can be interpreted as a list of
 *   [amount of characters (short not integer!),
 *    a short that this reader skips,
 *    pinyin construction using shengmu (initial) and yunmu (final) indexes,
 *    word as string
 *   ].
 *
 * </pre>
 *
 * @author keke
 */
public class BaiduBcdReader {
    /** File offset of the 32-bit word count. */
    private static final int OFFSET_TOTAL_WORDS = 0x250;

    /** File offset of the first dictionary entry. */
    private static final int OFFSET_DICTIONARY = 0x350;

    /** Pinyin initials (shengmu), addressed by the first code byte of a syllable. */
    private static final String[] FEN_MU = { "c", "d", "b", "f", "g", "h", "ch", "j", "k", "l", "m", "n", "", "p", "q", "r", "s", "t", "sh", "zh",
            "w", "x", "y", "z" };

    /**
     * Pinyin finals (yunmu), addressed by the second code byte of a syllable.
     * The table is grouped by length (4, 3, 2, 1 letters); slot 2 is "iong" and slot 30 is "o".
     * NOTE(review): the earlier revision repeated "ong" and "a" in those two slots, which made
     * "iong" and "o" unreachable; confirm against a real .bdict file.
     */
    private static final String[] YUN_MU = { "uang", "iang", "iong", "ang", "eng", "ian", "iao", "ing", "ong", "uai", "uan", "ai", "an", "ao", "ei",
            "en", "er", "ua", "ie", "in", "iu", "ou", "ia", "ue", "ui", "un", "uo", "a", "e", "i", "o", "u", "v" };

    /**
     * Prints every word of the given BDICT file together with its pinyin.
     *
     * @param args
     *            {@code args[0]} is the path of the BDICT file
     */
    public static void main(final String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: java BaiduBcdReader <file.bdict>");
            return;
        }
        // download from http://r6.mo.baidu.com/web/iw/index/
        analyze(args[0]);
    }

    /**
     * Reads the file and prints one "word TAB pinyin" line per dictionary entry.
     * I/O problems and malformed data are reported on stderr instead of being thrown.
     */
    private static void analyze(final String bdictFile) {
        try {
            final ByteBuffer data = readFile(bdictFile);
            System.out.println("文件: " + bdictFile);
            if (data.limit() < OFFSET_DICTIONARY) {
                throw new IOException("File is too short to be a BDICT dictionary: " + data.limit() + " bytes");
            }
            final int total = data.getInt(OFFSET_TOTAL_WORDS);
            data.position(OFFSET_DICTIONARY);
            for (int i = 0; i < total; i++) {
                requireRemaining(data, 4, i);
                // the length is an unsigned 16-bit character count
                final int length = data.getShort() & 0xffff;
                data.getShort(); // skipped, meaning unknown
                // every character needs 2 code bytes plus 2 bytes of UTF-16LE text
                requireRemaining(data, 4 * length, i);
                final StringBuilder pinyin = new StringBuilder();
                for (int j = 0; j < length; j++) {
                    if (j > 0) {
                        pinyin.append('\'');
                    }
                    pinyin.append(lookup(FEN_MU, data.get(), "initial"));
                    pinyin.append(lookup(YUN_MU, data.get(), "final"));
                }
                // decode straight from the backing array so the word length is not capped
                final String word = new String(data.array(), data.position(), 2 * length, "UTF-16LE");
                data.position(data.position() + (2 * length));
                System.out.println(word + "\t" + pinyin);
            }
            System.out.println("\nExtracted '" + bdictFile + "': " + total);
        } catch (final IOException e) {
            System.err.println("Error: " + e);
        }
    }

    /** Loads the whole file into a little-endian, array-backed buffer positioned at 0. */
    private static ByteBuffer readFile(final String path) throws IOException {
        try (RandomAccessFile file = new RandomAccessFile(path, "r"); FileChannel channel = file.getChannel()) {
            final long size = channel.size();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File is too large: " + size + " bytes");
            }
            final ByteBuffer buffer = ByteBuffer.allocate((int) size);
            // a single read() may return fewer bytes than requested
            while (buffer.hasRemaining() && (channel.read(buffer) != -1)) {
                // keep reading until the buffer is full or the file ends
            }
            buffer.flip();
            buffer.order(ByteOrder.LITTLE_ENDIAN);
            return buffer;
        }
    }

    /** Fails with an {@link IOException} when fewer than {@code needed} bytes are left. */
    private static void requireRemaining(final ByteBuffer data, final int needed, final int entry) throws IOException {
        if (data.remaining() < needed) {
            throw new IOException("Unexpected end of file in entry " + entry + " at 0x" + Integer.toHexString(data.position()));
        }
    }

    /** Maps an unsigned code byte to its table entry, rejecting codes outside the table. */
    private static String lookup(final String[] table, final byte code, final String kind) throws IOException {
        final int index = code & 0xff;
        if (index >= table.length) {
            throw new IOException("Unknown pinyin " + kind + " code 0x" + Integer.toHexString(index));
        }
        return table[index];
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Lingoes灵格斯电子词典LD2(LDF)文件解析 | |
* Copyright (c) 2010 Xiaoyun Zhu | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
*/ | |
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
/**
 * Lingoes LD2/LDF File Reader
 *
 * <pre>
 * Lingoes Format overview:
 *
 * General Information:
 * - Dictionary data are stored in deflate streams.
 * - Index group information is stored in an index array in the LD2 file itself.
 * - Numbers are using little endian byte order.
 * - Definitions and xml data have UTF-8 or UTF-16LE encodings.
 *
 * LD2 file schema:
 * - File Header
 * - File Description
 * - Additional Information (optional)
 * - Index Group (corresponds to definitions in dictionary)
 * - Deflated Dictionary Streams
 * -- Index Data
 * --- Offsets of definitions
 * --- Offsets of translations
 * --- Flags
 * --- References to other translations
 * -- Definitions
 * -- Translations (xml)
 *
 * TODO: find encoding / language fields to replace auto-detect of encodings
 *
 * </pre>
 *
 * @author keke
 *
 */
public class LingoesLd2Reader {
    /** Candidate encodings, tried in this order for both the words and the xml data. */
    private static final SensitiveStringDecoder[] AVAIL_ENCODINGS = { new SensitiveStringDecoder(Charset.forName("UTF-8")),
            new SensitiveStringDecoder(Charset.forName("UTF-16LE")), new SensitiveStringDecoder(Charset.forName("UTF-16BE")),
            new SensitiveStringDecoder(Charset.forName("EUC-JP")) };

    /**
     * Prints the header of an LD2 file and extracts its dictionary next to it as
     * {@code .inflated}, {@code .idx}, {@code .words}, {@code .xml} and {@code .output} files.
     *
     * @param args
     *            {@code args[0]} is the path of the LD2 file
     * @throws IOException
     *             if the file cannot be read or an output file cannot be written
     */
    public static void main(final String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: java LingoesLd2Reader <file.ld2>");
            return;
        }
        // download from
        // https://skydrive.live.com/?cid=a10100d37adc7ad3&sc=documents&id=A10100D37ADC7AD3%211172#cid=A10100D37ADC7AD3&sc=documents
        final String ld2File = args[0];
        // read lingoes ld2 into byte array
        final ByteBuffer dataRawBytes = readFile(ld2File);
        if (dataRawBytes.limit() < 0x60) {
            throw new IOException("File is too short to contain an LD2 header: " + ld2File);
        }
        System.out.println("文件:" + ld2File);
        System.out.println("类型:" + new String(dataRawBytes.array(), 0, 4, "ASCII"));
        System.out.println("版本:" + dataRawBytes.getShort(0x18) + "." + dataRawBytes.getShort(0x1A));
        System.out.println("ID: 0x" + Long.toHexString(dataRawBytes.getLong(0x1C)));
        final int offsetData = dataRawBytes.getInt(0x5C) + 0x60;
        // two ints are read at offsetData, so both must lie inside the file
        if ((offsetData >= 0) && ((dataRawBytes.limit() - 8) >= offsetData)) {
            System.out.println("简介地址:0x" + Integer.toHexString(offsetData));
            final int type = dataRawBytes.getInt(offsetData);
            System.out.println("简介类型:0x" + Integer.toHexString(type));
            final int offsetWithInfo = dataRawBytes.getInt(offsetData + 4) + offsetData + 12;
            if (type == 3) {
                // without additional information
                readDictionary(ld2File, dataRawBytes, offsetData);
            } else if (dataRawBytes.limit() > (offsetWithInfo - 0x1C)) {
                readDictionary(ld2File, dataRawBytes, offsetWithInfo);
            } else {
                System.err.println("文件不包含字典数据。网上字典?");
            }
        } else {
            System.err.println("文件不包含字典数据。网上字典?");
        }
    }

    /** Loads the whole file into a little-endian, array-backed buffer positioned at 0. */
    private static ByteBuffer readFile(final String path) throws IOException {
        try (RandomAccessFile file = new RandomAccessFile(path, "r"); FileChannel channel = file.getChannel()) {
            final long size = channel.size();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File is too large: " + size + " bytes");
            }
            final ByteBuffer buffer = ByteBuffer.allocate((int) size);
            // a single read() may return fewer bytes than requested
            while (buffer.hasRemaining() && (channel.read(buffer) != -1)) {
                // keep reading until the buffer is full or the file ends
            }
            buffer.flip();
            buffer.order(ByteOrder.LITTLE_ENDIAN);
            return buffer;
        }
    }

    /** Opens a text file for writing in UTF-8, independent of the platform default charset. */
    private static Writer openUtf8Writer(final String path) throws IOException {
        return new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8);
    }

    /**
     * Inflates {@code length} bytes of {@code data} starting at {@code offset} into
     * {@code inflatedFile}, appending when {@code append} is set.
     *
     * @return the number of compressed bytes consumed by the inflater
     */
    private static long decompress(final String inflatedFile, final ByteBuffer data, final int offset, final int length, final boolean append)
            throws IOException {
        final Inflater inflator = new Inflater();
        try (InflaterInputStream in = new InflaterInputStream(new ByteArrayInputStream(data.array(), offset, length), inflator, 1024 * 8);
                FileOutputStream out = new FileOutputStream(inflatedFile, append)) {
            writeInputStream(in, out);
            return inflator.getBytesRead();
        } finally {
            // the stream does not own a caller-supplied inflater, so release it here even on failure
            inflator.end();
        }
    }

    /**
     * Tries every (word encoding, xml encoding) pair on up to the first ten definitions and
     * returns the first pair that decodes all of them without error; falls back to UTF-16LE
     * for both when no pair works.
     *
     * @return a two-element array: word decoder, xml decoder
     */
    private static SensitiveStringDecoder[] detectEncodings(final ByteBuffer inflatedBytes, final int offsetWords, final int offsetXml,
            final int defTotal, final int dataLen, final int[] idxData, final String[] defData) {
        final int test = Math.min(defTotal, 10);
        for (final SensitiveStringDecoder wordDecoder : AVAIL_ENCODINGS) {
            for (final SensitiveStringDecoder xmlDecoder : AVAIL_ENCODINGS) {
                try {
                    for (int i = 0; i < test; i++) {
                        readDefinitionData(inflatedBytes, offsetWords, offsetXml, dataLen, wordDecoder, xmlDecoder, idxData, defData, i);
                    }
                    System.out.println("词组编码:" + wordDecoder.name);
                    System.out.println("XML编码:" + xmlDecoder.name);
                    return new SensitiveStringDecoder[] { wordDecoder, xmlDecoder };
                } catch (final RuntimeException ignored) {
                    // this pair cannot decode the sample entries; try the next one
                }
            }
        }
        System.err.println("自动识别编码失败!选择UTF-16LE继续。");
        return new SensitiveStringDecoder[] { AVAIL_ENCODINGS[1], AVAIL_ENCODINGS[1] };
    }

    /**
     * Decodes every definition of the inflated dictionary and writes the word list, the
     * stripped definitions, the combined "word=definition" list and the index list.
     * All output files are written in UTF-8.
     */
    private static void extract(final String inflatedFile, final String indexFile, final String extractedWordsFile, final String extractedXmlFile,
            final String extractedOutputFile, final int[] idxArray, final int offsetDefs, final int offsetXml) throws IOException {
        System.out.println("写入'" + extractedOutputFile + "'。。。");
        // read inflated data (before any output file is created)
        final ByteBuffer dataRawBytes = readFile(inflatedFile);
        final int dataLen = 10;
        // the index holds one extra trailing entry that only supplies end offsets
        final int defTotal = Math.max(0, (offsetDefs / dataLen) - 1);
        final String[] words = new String[defTotal];
        final int[] idxData = new int[6];
        final String[] defData = new String[2];
        int counter = 0;
        try (Writer indexWriter = openUtf8Writer(indexFile);
                Writer defsWriter = openUtf8Writer(extractedWordsFile);
                Writer xmlWriter = openUtf8Writer(extractedXmlFile);
                Writer outputWriter = openUtf8Writer(extractedOutputFile)) {
            final SensitiveStringDecoder[] encodings = detectEncodings(dataRawBytes, offsetDefs, offsetXml, defTotal, dataLen, idxData, defData);
            for (int i = 0; i < defTotal; i++) {
                readDefinitionData(dataRawBytes, offsetDefs, offsetXml, dataLen, encodings[0], encodings[1], idxData, defData, i);
                words[i] = defData[0];
                defsWriter.write(defData[0]);
                defsWriter.write("\n");
                xmlWriter.write(defData[1]);
                xmlWriter.write("\n");
                outputWriter.write(defData[0]);
                outputWriter.write("=");
                outputWriter.write(defData[1]);
                outputWriter.write("\n");
                System.out.println(defData[0] + " = " + defData[1]);
                counter++;
            }
            for (final int idx : idxArray) {
                // index values come straight from the file; skip the ones that point nowhere
                if ((idx < 0) || (idx >= defTotal)) {
                    System.err.println("Skipping invalid index entry: " + idx);
                    continue;
                }
                indexWriter.write(words[idx]);
                indexWriter.write(", ");
                indexWriter.write(String.valueOf(idx));
                indexWriter.write("\n");
            }
        }
        System.out.println("成功读出" + counter + "组数据。");
    }

    /**
     * Reads 18 bytes at {@code position}: two ints, two unsigned bytes and two more ints.
     * The last two ints are the first two ints of the following 10-byte index entry, which
     * the callers use as end offsets.
     */
    private static void getIdxData(final ByteBuffer dataRawBytes, final int position, final int[] wordIdxData) {
        dataRawBytes.position(position);
        wordIdxData[0] = dataRawBytes.getInt();
        wordIdxData[1] = dataRawBytes.getInt();
        wordIdxData[2] = dataRawBytes.get() & 0xff;
        wordIdxData[3] = dataRawBytes.get() & 0xff;
        wordIdxData[4] = dataRawBytes.getInt();
        wordIdxData[5] = dataRawBytes.getInt();
    }

    /**
     * Inflates the consecutive deflate streams that start at the buffer's current position
     * into {@code inflatedFile}; each list element is the end offset of one stream relative
     * to that position. A failure is reported on stderr and stops the decompression.
     */
    private static void inflate(final ByteBuffer dataRawBytes, final List<Integer> deflateStreams, final String inflatedFile) {
        System.out.println("解压缩'" + deflateStreams.size() + "'个数据流至'" + inflatedFile + "'。。。");
        final int startOffset = dataRawBytes.position();
        int offset = -1;
        int lastOffset = startOffset;
        boolean append = false;
        try {
            for (final Integer offsetRelative : deflateStreams) {
                offset = startOffset + offsetRelative.intValue();
                decompress(inflatedFile, dataRawBytes, lastOffset, offset - lastOffset, append);
                // the first stream creates the file, every later one is appended
                append = true;
                lastOffset = offset;
            }
        } catch (final IOException | RuntimeException e) {
            System.err.println("解压缩失败: 0x" + Integer.toHexString(offset) + ": " + e.toString());
        }
    }

    /**
     * Decodes definition {@code i}: stores the word in {@code defData[0]} and the stripped
     * definition text in {@code defData[1]}. When the index entry carries references, the
     * referenced entries' definitions are prepended, separated by ", ".
     *
     * @throws RuntimeException
     *             if the index data points outside the buffer or the bytes are not valid in
     *             the given encodings
     */
    private static void readDefinitionData(final ByteBuffer inflatedBytes, final int offsetWords, final int offsetXml, final int dataLen,
            final SensitiveStringDecoder wordStringDecoder, final SensitiveStringDecoder xmlStringDecoder, final int[] idxData,
            final String[] defData, final int i) {
        getIdxData(inflatedBytes, dataLen * i, idxData);
        int wordStart = idxData[0];
        int xmlStart = idxData[1];
        // idxData[2] holds flags, which are not used
        int refs = idxData[3];
        final int wordEnd = idxData[4];
        int xmlEnd = idxData[5];
        String xml = strip(new String(xmlStringDecoder.decode(inflatedBytes.array(), offsetXml + xmlStart, xmlEnd - xmlStart)));
        while (refs-- > 0) {
            // each reference is a 4-byte entry number stored in front of the word itself
            final int ref = inflatedBytes.getInt(offsetWords + wordStart);
            getIdxData(inflatedBytes, dataLen * ref, idxData);
            xmlStart = idxData[1];
            xmlEnd = idxData[5];
            final String referenced = strip(new String(xmlStringDecoder.decode(inflatedBytes.array(), offsetXml + xmlStart, xmlEnd - xmlStart)));
            if (xml.isEmpty()) {
                xml = referenced;
            } else {
                xml = referenced + ", " + xml;
            }
            wordStart += 4;
        }
        defData[1] = xml;
        defData[0] = new String(wordStringDecoder.decode(inflatedBytes.array(), offsetWords + wordStart, wordEnd - wordStart));
    }

    /**
     * Parses the dictionary header at {@code offsetWithIndex}, inflates the compressed
     * streams to {@code <ld2File>.inflated} and, when that file exists, extracts the
     * dictionary from it.
     */
    private static void readDictionary(final String ld2File, final ByteBuffer dataRawBytes, final int offsetWithIndex) throws IOException {
        System.out.println("词典类型:0x" + Integer.toHexString(dataRawBytes.getInt(offsetWithIndex)));
        final int limit = dataRawBytes.getInt(offsetWithIndex + 4) + offsetWithIndex + 8;
        final int offsetIndex = offsetWithIndex + 0x1C;
        final int offsetCompressedDataHeader = dataRawBytes.getInt(offsetWithIndex + 8) + offsetIndex;
        final int inflatedWordsIndexLength = dataRawBytes.getInt(offsetWithIndex + 12);
        final int inflatedWordsLength = dataRawBytes.getInt(offsetWithIndex + 16);
        final int inflatedXmlLength = dataRawBytes.getInt(offsetWithIndex + 20);
        final int definitions = (offsetCompressedDataHeader - offsetIndex) / 4;
        final List<Integer> deflateStreams = new ArrayList<>();
        dataRawBytes.position(offsetCompressedDataHeader + 8);
        // the stream-offset table ends where "last offset + table end" reaches the limit
        int offset = dataRawBytes.getInt();
        while ((offset + dataRawBytes.position()) < limit) {
            offset = dataRawBytes.getInt();
            deflateStreams.add(Integer.valueOf(offset));
        }
        final int offsetCompressedData = dataRawBytes.position();
        System.out.println("索引词组数目:" + definitions);
        System.out.println("索引地址/大小:0x" + Integer.toHexString(offsetIndex) + " / " + (offsetCompressedDataHeader - offsetIndex) + " B");
        System.out.println("压缩数据地址/大小:0x" + Integer.toHexString(offsetCompressedData) + " / " + (limit - offsetCompressedData) + " B");
        System.out.println("词组索引地址/大小(解压缩后):0x0 / " + inflatedWordsIndexLength + " B");
        System.out.println("词组地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength) + " / " + inflatedWordsLength + " B");
        System.out.println("XML地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength + inflatedWordsLength) + " / " + inflatedXmlLength + " B");
        System.out.println("文件大小(解压缩后):" + ((inflatedWordsIndexLength + inflatedWordsLength + inflatedXmlLength) / 1024) + " KB");
        final String inflatedFile = ld2File + ".inflated";
        inflate(dataRawBytes, deflateStreams, inflatedFile);
        if (new File(inflatedFile).isFile()) {
            final String indexFile = ld2File + ".idx";
            final String extractedFile = ld2File + ".words";
            final String extractedXmlFile = ld2File + ".xml";
            final String extractedOutputFile = ld2File + ".output";
            dataRawBytes.position(offsetIndex);
            final int[] idxArray = new int[definitions];
            for (int i = 0; i < definitions; i++) {
                idxArray[i] = dataRawBytes.getInt();
            }
            extract(inflatedFile, indexFile, extractedFile, extractedXmlFile, extractedOutputFile, idxArray, inflatedWordsIndexLength,
                    inflatedWordsIndexLength + inflatedWordsLength);
        }
    }

    /**
     * Reduces a definition to plain text on a single line.
     * <ul>
     * <li>With a CDATA section: the content of the first section.</li>
     * <li>With a {@code <Ô...>} element: the text between its opening tag and {@code </Ô}.</li>
     * <li>Otherwise: all text outside of {@code <...>} tags, including text before the first
     * and after the last tag.</li>
     * </ul>
     * An unterminated CDATA section or {@code <Ô} element yields an empty string.
     */
    private static String strip(final String xml) {
        final String cdataOpen = "<![CDATA[";
        int open = xml.indexOf(cdataOpen);
        if (open != -1) {
            final int end = xml.indexOf("]]>", open);
            return (end == -1) ? "" : flatten(xml.substring(open + cdataOpen.length(), end));
        }
        open = xml.indexOf("<Ô");
        if (open != -1) {
            final int end = xml.indexOf("</Ô", open);
            if (end == -1) {
                return "";
            }
            final int tagClose = xml.indexOf(">", open + 1);
            // no '>' before the closing tag means there is no content to cut out
            if (tagClose >= end) {
                return "";
            }
            return flatten(xml.substring(tagClose + 1, end));
        }
        final StringBuilder text = new StringBuilder();
        boolean insideTag = false;
        for (int i = 0; i < xml.length(); i++) {
            final char c = xml.charAt(i);
            if (c == '<') {
                insideTag = true;
            } else if ((c == '>') && insideTag) {
                insideTag = false;
            } else if (!insideTag) {
                text.append(c);
            }
        }
        return flatten(text.toString());
    }

    /** Replaces tabs, line feeds and the 0x1E/0x1F separators with spaces. */
    private static String flatten(final String text) {
        return text.replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
    }

    /** Copies {@code in} to {@code out} until {@code in} is exhausted. */
    private static void writeInputStream(final InputStream in, final OutputStream out) throws IOException {
        final byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = in.read(buffer)) > 0) {
            out.write(buffer, 0, len);
        }
    }

    /**
     * Charset decoder that rejects malformed or unmappable input instead of substituting
     * replacement characters, which is what makes the encoding detection work.
     */
    private static class SensitiveStringDecoder {
        public final String name;
        private final CharsetDecoder cd;

        SensitiveStringDecoder(final Charset cs) {
            this.cd = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
            this.name = cs.name();
        }

        /**
         * Decodes {@code len} bytes of {@code ba} starting at {@code off}.
         *
         * @throws IllegalArgumentException
         *             if the range lies outside the array or the bytes are not valid in this
         *             decoder's charset
         */
        char[] decode(final byte[] ba, final int off, final int len) {
            // reject bad ranges before allocating a buffer sized from untrusted file data
            if ((off < 0) || (len < 0) || (off > (ba.length - len))) {
                throw new IllegalArgumentException("Range [" + off + ", +" + len + ") is outside of the " + ba.length + " byte buffer");
            }
            final int en = (int) (len * (double) this.cd.maxCharsPerByte());
            final char[] ca = new char[en];
            if (len == 0) {
                return ca;
            }
            this.cd.reset();
            final ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
            final CharBuffer cb = CharBuffer.wrap(ca);
            try {
                CoderResult cr = this.cd.decode(bb, cb, true);
                if (!cr.isUnderflow()) {
                    cr.throwException();
                }
                cr = this.cd.flush(cb);
                if (!cr.isUnderflow()) {
                    cr.throwException();
                }
            } catch (final CharacterCodingException x) {
                // expected whenever the bytes are in a different encoding than this decoder's
                throw new IllegalArgumentException("Input is not valid " + this.name, x);
            }
            return SensitiveStringDecoder.safeTrim(ca, cb.position());
        }

        /** Returns {@code ca} itself when it is completely filled, otherwise a copy of its first {@code len} chars. */
        private static char[] safeTrim(final char[] ca, final int len) {
            return (len == ca.length) ? ca : Arrays.copyOf(ca, len);
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* QQ拼音qpyd词库文件解析 | |
* Copyright (c) 2010 Xiaoyun Zhu | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
*/ | |
import java.io.ByteArrayOutputStream; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.io.RandomAccessFile; | |
import java.nio.ByteBuffer; | |
import java.nio.ByteOrder; | |
import java.nio.channels.Channels; | |
import java.nio.channels.FileChannel; | |
import java.util.Arrays; | |
import java.util.zip.InflaterOutputStream; | |
/**
 * QQ Pinyin IME QPYD File Reader
 *
 * <pre>
 * QPYD Format overview:
 *
 * General Information:
 * - Chinese characters are all encoded with UTF-16LE.
 * - Pinyin are encoded in ascii (or UTF-8).
 * - Numbers are using little endian byte order.
 *
 * QPYD hex analysis:
 * - 0x00 QPYD file identifier
 * - 0x38 offset of compressed data (word-pinyin-dictionary)
 * - 0x44 total words in qpyd
 * - 0x60 start of header information
 *
 * Compressed data analysis:
 * - zip/standard (beginning with 0x789C) is used in (all analyzed) qpyd files
 * - data is divided in two parts
 * -- 1. offset and length information (this reader consumes 10 bytes per pinyin-word pair)
 *       0x00 length of pinyin
 *       0x01 length of word
 *       0x02 four bytes that are skipped
 *       0x06 offset of the pinyin; the first entry's offset marks the start of part 2
 * -- 2. actual data
 *       Dictionary data has the form ((pinyin)(word))* with no separators.
 *       Data can only be read using offset and length information.
 *
 * </pre>
 *
 */
public class QQPinyinQpydReader {
    /** File offset of the int holding the start address of the compressed dictionary. */
    private static final int OFFSET_ZIPPED_DICT_ADDR = 0x38;

    /** File offset of the int holding the number of words. */
    private static final int OFFSET_TOTAL_WORDS = 0x44;

    /** File offset of the UTF-16LE header text. */
    private static final int OFFSET_HEADER_TEXT = 0x60;

    /** Bytes consumed per index entry: two length bytes, four skipped bytes, one int offset. */
    private static final int INDEX_ENTRY_SIZE = 10;

    /**
     * Prints the header and every "word TAB pinyin" pair of a QPYD file. As a debugging aid
     * the inflated dictionary is also saved as {@code <file>.unzipped}.
     *
     * @param args
     *            {@code args[0]} is the path of the QPYD file
     * @throws IOException
     *             if the file cannot be read, is malformed, or the debug file cannot be written
     */
    public static void main(final String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: java QQPinyinQpydReader <file.qpyd>");
            return;
        }
        // download from http://dict.py.qq.com/list.php
        final String qqydFile = args[0];
        // qpyd as bytes
        final ByteBuffer dataRawBytes = readFile(qqydFile);
        System.out.println("文件: " + qqydFile);
        if (dataRawBytes.limit() < OFFSET_HEADER_TEXT) {
            throw new IOException("File is too short to contain a QPYD header: " + qqydFile);
        }
        // read info of compressed data
        final int startZippedDictAddr = dataRawBytes.getInt(OFFSET_ZIPPED_DICT_ADDR);
        if ((startZippedDictAddr < OFFSET_HEADER_TEXT) || (startZippedDictAddr > dataRawBytes.limit())) {
            throw new IOException("Invalid compressed data offset: 0x" + Integer.toHexString(startZippedDictAddr));
        }
        final int zippedDictLength = dataRawBytes.limit() - startZippedDictAddr;
        // header as UTF-16LE string
        final String dataString = new String(dataRawBytes.array(), OFFSET_HEADER_TEXT, startZippedDictAddr - OFFSET_HEADER_TEXT, "UTF-16LE");
        // print header
        System.out.println("名称:" + substringBetween(dataString, "Name: ", "\r\n"));
        System.out.println("类型:" + substringBetween(dataString, "Type: ", "\r\n"));
        System.out.println("子类型:" + substringBetween(dataString, "FirstType: ", "\r\n"));
        System.out.println("词库说明:" + substringBetween(dataString, "Intro: ", "\r\n"));
        System.out.println("词库样例:" + substringBetween(dataString, "Example: ", "\r\n"));
        System.out.println("词条数:" + dataRawBytes.getInt(OFFSET_TOTAL_WORDS));
        // inflate the zipped dictionary into a byte array
        final ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
        try (InflaterOutputStream inflater = new InflaterOutputStream(dataOut)) {
            inflater.write(dataRawBytes.array(), startZippedDictAddr, zippedDictLength);
        }
        // uncompressed dictionary as bytes
        final byte[] byteArray = dataOut.toByteArray();
        final ByteBuffer dataUnzippedBytes = ByteBuffer.wrap(byteArray);
        dataUnzippedBytes.order(ByteOrder.LITTLE_ENDIAN);
        // for debugging: save unzipped data to *.unzipped file
        try (FileOutputStream out = new FileOutputStream(qqydFile + ".unzipped")) {
            out.write(byteArray);
            System.out.println("压缩数据:0x" + Integer.toHexString(startZippedDictAddr) + " (解压前:" + zippedDictLength + " B, 解压后:" + dataUnzippedBytes.limit() + " B)");
        }
        // start address of the actual dictionary data; known once the first index entry is read
        int unzippedDictStartAddr = -1;
        while ((dataUnzippedBytes.remaining() >= INDEX_ENTRY_SIZE)
                && ((unzippedDictStartAddr == -1) || (dataUnzippedBytes.position() < unzippedDictStartAddr))) {
            final int pinyinLength = dataUnzippedBytes.get() & 0xff;
            final int wordLength = dataUnzippedBytes.get() & 0xff;
            dataUnzippedBytes.getInt(); // garbage
            final int pinyinStartAddr = dataUnzippedBytes.getInt();
            // long arithmetic: the offset comes from the file and may be close to Integer.MAX_VALUE
            if ((pinyinStartAddr < 0) || (((long) pinyinStartAddr + pinyinLength + wordLength) > byteArray.length)) {
                throw new IOException("Index entry points outside of the dictionary data: 0x" + Integer.toHexString(pinyinStartAddr));
            }
            final int wordStartAddr = pinyinStartAddr + pinyinLength;
            if (unzippedDictStartAddr == -1) {
                unzippedDictStartAddr = pinyinStartAddr;
                System.out.println("词库地址(解压后):0x" + Integer.toHexString(unzippedDictStartAddr) + "\n");
            }
            final String pinyin = new String(byteArray, pinyinStartAddr, pinyinLength, "UTF-8");
            final String word = new String(byteArray, wordStartAddr, wordLength, "UTF-16LE");
            System.out.println(word + "\t" + pinyin);
        }
    }

    /** Loads the whole file into a little-endian, array-backed buffer positioned at 0. */
    private static ByteBuffer readFile(final String path) throws IOException {
        try (RandomAccessFile file = new RandomAccessFile(path, "r"); FileChannel channel = file.getChannel()) {
            final long size = channel.size();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File is too large: " + size + " bytes");
            }
            final ByteBuffer buffer = ByteBuffer.allocate((int) size);
            // a single read() may return fewer bytes than requested
            while (buffer.hasRemaining() && (channel.read(buffer) != -1)) {
                // keep reading until the buffer is full or the file ends
            }
            buffer.flip();
            buffer.order(ByteOrder.LITTLE_ENDIAN);
            return buffer;
        }
    }

    /**
     * Returns the text between the first occurrence of {@code start} and the next occurrence
     * of {@code end} that follows it.
     *
     * @param text
     *            the text to search
     * @param start
     *            the delimiter in front of the wanted text
     * @param end
     *            the delimiter behind the wanted text; searched only after {@code start}
     * @return the enclosed text (possibly empty), or {@code null} if either delimiter is missing
     */
    public static final String substringBetween(final String text, final String start, final String end) {
        final int nStart = text.indexOf(start);
        if (nStart == -1) {
            return null;
        }
        // begin behind the start delimiter so that an 'end' occurring inside 'start' is not matched
        final int contentStart = nStart + start.length();
        final int nEnd = text.indexOf(end, contentStart);
        return (nEnd == -1) ? null : text.substring(contentStart, nEnd);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* 搜狗拼音输入法SCEL词库文件解析 | |
* Copyright (c) 2010 Xiaoyun Zhu | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
*/ | |
import java.io.ByteArrayOutputStream; | |
import java.io.IOException; | |
import java.io.RandomAccessFile; | |
import java.nio.ByteBuffer; | |
import java.nio.ByteOrder; | |
import java.nio.channels.Channels; | |
import java.nio.channels.FileChannel; | |
/**
 * Sogou Pinyin IME SCEL File Reader
 *
 * <pre>
 * SCEL Format overview:
 *
 * General Information:
 * - Chinese characters and pinyin are all encoded with UTF-16LE.
 * - Numbers are using little endian byte order.
 *
 * SCEL hex analysis:
 * - 0x0 Pinyin List Offset
 * - 0x120 total number of words
 * - 0x<PY-Offset> total number of pinyin
 * - ... List of pinyin as [index, byte length of pinyin, pinyin as string] triples
 * - ... Dictionary
 * - ... <additional garbage>
 *
 * Dictionary format:
 * - It can be interpreted as a list of
 *   [alternatives of words,
 *    byte length of pinyin indexes, pinyin indexes,
 *    [byte length of word, word as string, length of skip bytes, skip bytes]
 *    ... (alternatives)
 *   ].
 *
 * </pre>
 *
 */
class SogouScelReader {
    /** File offset of the int holding the number of dictionary entries. */
    private static final int OFFSET_TOTAL_WORDS = 0x120;

    /**
     * Prints every entry of a SCEL file as "word[, word...] TAB pinyin".
     *
     * @param args
     *            {@code args[0]} is the path of the SCEL file
     * @throws IOException
     *             if the file cannot be read or ends before all announced data was read
     */
    public static void main(final String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: java SogouScelReader <file.scel>");
            return;
        }
        // download from http://pinyin.sogou.com/dict
        final String scelFile = args[0];
        // scel as bytes
        final ByteBuffer data = readFile(scelFile);
        System.out.println("文件: " + scelFile);
        if (data.limit() < (OFFSET_TOTAL_WORDS + 4)) {
            throw new IOException("File is too short to contain a SCEL header: " + scelFile);
        }
        final int totalWords = data.getInt(OFFSET_TOTAL_WORDS);
        // pinyin offset
        final int pinyinOffset = data.getInt(0);
        if ((pinyinOffset < 0) || (pinyinOffset > (data.limit() - 4))) {
            throw new IOException("Invalid pinyin list offset: 0x" + Integer.toHexString(pinyinOffset));
        }
        data.position(pinyinOffset);
        final int totalPinyin = data.getInt();
        // indexes are unsigned shorts, so this table can hold every possible index
        final String[] pyDict = new String[1 << 16];
        for (int i = 0; i < totalPinyin; i++) {
            final int idx = readUnsignedShort(data);
            final int len = readUnsignedShort(data);
            pyDict[idx] = readUtf16(data, len);
        }
        // extract dictionary
        int counter = 0;
        for (int i = 0; i < totalWords; i++) {
            final StringBuilder py = new StringBuilder();
            final StringBuilder word = new StringBuilder();
            final int alternatives = readUnsignedShort(data);
            // the stored value is a byte length; every pinyin index takes two bytes
            final int pyCount = readUnsignedShort(data) / 2;
            for (int p = 0; p < pyCount; p++) {
                if (p > 0) {
                    py.append('\'');
                }
                py.append(pyDict[readUnsignedShort(data)]);
            }
            for (int a = 0; a < alternatives; a++) {
                if (a > 0) {
                    word.append(", ");
                }
                word.append(readUtf16(data, readUnsignedShort(data)));
                // skip bytes: advance the position instead of copying into a fixed-size buffer
                skip(data, readUnsignedShort(data));
            }
            System.out.println(word.toString() + "\t" + py.toString());
            counter++;
        }
        System.out.println("\n读出词汇'" + scelFile + "': " + counter);
    }

    /** Loads the whole file into a little-endian, array-backed buffer positioned at 0. */
    private static ByteBuffer readFile(final String path) throws IOException {
        try (RandomAccessFile file = new RandomAccessFile(path, "r"); FileChannel channel = file.getChannel()) {
            final long size = channel.size();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File is too large: " + size + " bytes");
            }
            final ByteBuffer buffer = ByteBuffer.allocate((int) size);
            // a single read() may return fewer bytes than requested
            while (buffer.hasRemaining() && (channel.read(buffer) != -1)) {
                // keep reading until the buffer is full or the file ends
            }
            buffer.flip();
            buffer.order(ByteOrder.LITTLE_ENDIAN);
            return buffer;
        }
    }

    /** Fails with an {@link IOException} when fewer than {@code needed} bytes are left. */
    private static void requireRemaining(final ByteBuffer data, final int needed) throws IOException {
        if (data.remaining() < needed) {
            throw new IOException("Unexpected end of file at 0x" + Integer.toHexString(data.position()) + ": " + needed + " more bytes expected");
        }
    }

    /** Reads the next two bytes as an unsigned 16-bit number. */
    private static int readUnsignedShort(final ByteBuffer data) throws IOException {
        requireRemaining(data, 2);
        return data.getShort() & 0xffff;
    }

    /** Decodes the next {@code byteLength} bytes as UTF-16LE text and advances past them. */
    private static String readUtf16(final ByteBuffer data, final int byteLength) throws IOException {
        requireRemaining(data, byteLength);
        final String text = new String(data.array(), data.position(), byteLength, "UTF-16LE");
        data.position(data.position() + byteLength);
        return text;
    }

    /** Advances the position by {@code byteLength} bytes. */
    private static void skip(final ByteBuffer data, final int byteLength) throws IOException {
        requireRemaining(data, byteLength);
        data.position(data.position() + byteLength);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
USE: