Dictionary file export tools (Sogou Pinyin IME SCEL file parser, QQ Pinyin qpyd file parser, Baidu Pinyin IME BCD file parser, Lingoes LD2 (LDF) electronic dictionary file parser)
/* Baidu Pinyin IME BCD dictionary file parser
* Copyright (c) 2010 Xiaoyun Zhu
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
/**
* Baidu Pinyin IME BDICT File Reader
*
* <pre>
* BDICT Format overview:
*
* General Information:
* - Chinese characters and pinyin are all encoded with UTF-16LE.
* - Numbers use little-endian byte order.
*
* BDICT hex analysis:
* - 0x250 total number of words
* - 0x350 dictionary offset
* - 0x<Offset> Dictionary
*
* Dictionary format:
* - It can be interpreted as a list of
* [number of characters (a short, not an int!),
* an unknown short (skipped by this reader),
* pinyin for each character as a (fenmu index, yunmu index) byte pair,
* word as a UTF-16LE string
* ].
*
* </pre>
*
* @author keke
*/
public class BaiduBcdReader {
private static final String[] FEN_MU = { "c", "d", "b", "f", "g", "h", "ch", "j", "k", "l", "m", "n", "", "p", "q", "r", "s", "t", "sh", "zh", "w", "x", "y",
"z" };
private static final String[] YUN_MU = { "uang", "iang", "iong", "ang", "eng", "ian", "iao", "ing", "ong", "uai", "uan", "ai", "an", "ao", "ei", "en", "er",
"ua", "ie", "in", "iu", "ou", "ia", "ue", "ui", "un", "uo", "a", "e", "i", "o", "u", "v" };
public static void main(final String[] args) {
// download from http://r6.mo.baidu.com/web/iw/index/
final String bdictFile = args[0];
BaiduBcdReader.analyze(bdictFile);
}
@SuppressWarnings("resource")
private static void analyze(final String bdictFile) {
// read bdict into byte array
RandomAccessFile file = null;
try {
file = new RandomAccessFile(bdictFile, "r");
final FileChannel fChannel = file.getChannel();
final ByteBuffer dataRawBytes = ByteBuffer.allocate((int) fChannel.size());
fChannel.read(dataRawBytes);
dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);
dataRawBytes.rewind();
fChannel.close();
System.out.println("文件: " + bdictFile);
final byte[] buf = new byte[1024];
final int total = dataRawBytes.getInt(0x250);
// dictionary offset
dataRawBytes.position(0x350);
for (int i = 0; i < total; i++) {
final int length = dataRawBytes.getShort();
dataRawBytes.getShort();
boolean first = true;
final StringBuilder pinyin = new StringBuilder();
for (int j = 0; j < length; j++) {
if (first) {
first = false;
} else {
pinyin.append('\'');
}
pinyin.append(BaiduBcdReader.FEN_MU[dataRawBytes.get()] + BaiduBcdReader.YUN_MU[dataRawBytes.get()]);
}
dataRawBytes.get(buf, 0, 2 * length);
final String word = new String(buf, 0, 2 * length, "UTF-16LE");
System.out.println(word + "\t" + pinyin);
}
System.out.println("\nExtracted '" + bdictFile + "': " + total);
} catch (IOException e) {
System.err.println("Error: " + e);
} finally {
// close quietly, without relying on the external cn.kk.kkdict Helper class
if (file != null) {
try {
file.close();
} catch (final IOException e) {
// ignore
}
}
}
}
}
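A minimal, self-contained sketch of the entry layout described in the BDICT overview above: it builds one synthetic entry in memory (the bytes are illustrative, not taken from a real .bcd file) and decodes it with the same FEN_MU/YUN_MU tables and the same steps as BaiduBcdReader.analyze().

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

/**
 * Sketch only: one BDICT-style dictionary entry -- [number of characters (short),
 * unknown short, one (fenmu, yunmu) index pair per character, word in UTF-16LE] --
 * built by hand and decoded back.
 */
public class BdictEntrySketch {
  public static void main(final String[] args) throws UnsupportedEncodingException {
    final String[] fenMu = { "c", "d", "b", "f", "g", "h", "ch", "j", "k", "l", "m", "n", "", "p", "q", "r", "s", "t", "sh", "zh", "w", "x", "y", "z" };
    final String[] yunMu = { "uang", "iang", "iong", "ang", "eng", "ian", "iao", "ing", "ong", "uai", "uan", "ai", "an", "ao", "ei", "en", "er",
        "ua", "ie", "in", "iu", "ou", "ia", "ue", "ui", "un", "uo", "a", "e", "i", "o", "u", "v" };
    // synthetic entry: 你好 = ni'hao -> (n, i) = (11, 29), (h, ao) = (5, 13)
    final byte[] wordBytes = "你好".getBytes("UTF-16LE");
    final ByteBuffer entry = ByteBuffer.allocate(2 + 2 + (2 * 2) + wordBytes.length).order(ByteOrder.LITTLE_ENDIAN);
    entry.putShort((short) 2); // number of characters
    entry.putShort((short) 0); // unknown short, skipped by the reader
    entry.put(new byte[] { 11, 29, 5, 13 }); // fenmu/yunmu index pairs
    entry.put(wordBytes); // the word itself, UTF-16LE
    entry.rewind();
    // decode exactly like BaiduBcdReader.analyze()
    final int length = entry.getShort();
    entry.getShort();
    final StringBuilder pinyin = new StringBuilder();
    for (int j = 0; j < length; j++) {
      if (j > 0) {
        pinyin.append('\'');
      }
      pinyin.append(fenMu[entry.get()]).append(yunMu[entry.get()]);
    }
    final byte[] buf = new byte[2 * length];
    entry.get(buf);
    System.out.println(new String(buf, "UTF-16LE") + "\t" + pinyin); // prints: 你好  ni'hao
  }
}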
/* Lingoes LD2 (LDF) electronic dictionary file parser
* Copyright (c) 2010 Xiaoyun Zhu
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
/**
* Lingoes LD2/LDF File Reader
*
* <pre>
* Lingoes Format overview:
*
* General Information:
* - Dictionary data are stored in deflate streams.
* - Index group information is stored in an index array in the LD2 file itself.
* - Numbers use little-endian byte order.
* - Definitions and xml data have UTF-8 or UTF-16LE encodings.
*
* LD2 file schema:
* - File Header
* - File Description
* - Additional Information (optional)
* - Index Group (corresponds to definitions in dictionary)
* - Deflated Dictionary Streams
* -- Index Data
* --- Offsets of definitions
* --- Offsets of translations
* --- Flags
* --- References to other translations
* -- Definitions
* -- Translations (xml)
*
* TODO: find encoding / language fields to replace auto-detect of encodings
*
* </pre>
*
* @author keke
*
*/
public class LingoesLd2Reader {
private static final SensitiveStringDecoder[] AVAIL_ENCODINGS = { new SensitiveStringDecoder(Charset.forName("UTF-8")),
new SensitiveStringDecoder(Charset.forName("UTF-16LE")), new SensitiveStringDecoder(Charset.forName("UTF-16BE")),
new SensitiveStringDecoder(Charset.forName("EUC-JP")) };
public static void main(final String[] args) throws IOException {
// download from
// https://skydrive.live.com/?cid=a10100d37adc7ad3&sc=documents&id=A10100D37ADC7AD3%211172#cid=A10100D37ADC7AD3&sc=documents
// String ld2File = Helper.DIR_IN_DICTS+"\\lingoes\\Prodic English-Vietnamese Business.ld2";
final String ld2File = args[0];
// read lingoes ld2 into byte array
final ByteBuffer dataRawBytes;
try (RandomAccessFile file = new RandomAccessFile(ld2File, "r"); final FileChannel fChannel = file.getChannel();) {
dataRawBytes = ByteBuffer.allocate((int) fChannel.size());
fChannel.read(dataRawBytes);
}
dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);
dataRawBytes.rewind();
System.out.println("文件:" + ld2File);
System.out.println("类型:" + new String(dataRawBytes.array(), 0, 4, "ASCII"));
System.out.println("版本:" + dataRawBytes.getShort(0x18) + "." + dataRawBytes.getShort(0x1A));
System.out.println("ID: 0x" + Long.toHexString(dataRawBytes.getLong(0x1C)));
final int offsetData = dataRawBytes.getInt(0x5C) + 0x60;
if (dataRawBytes.limit() > offsetData) {
System.out.println("简介地址:0x" + Integer.toHexString(offsetData));
final int type = dataRawBytes.getInt(offsetData);
System.out.println("简介类型:0x" + Integer.toHexString(type));
final int offsetWithInfo = dataRawBytes.getInt(offsetData + 4) + offsetData + 12;
if (type == 3) {
// without additional information
LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetData);
} else if (dataRawBytes.limit() > (offsetWithInfo - 0x1C)) {
LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetWithInfo);
} else {
System.err.println("文件不包含字典数据。网上字典?");
}
} else {
System.err.println("文件不包含字典数据。网上字典?");
}
}
private static final long decompress(final String inflatedFile, final ByteBuffer data, final int offset, final int length, final boolean append)
throws IOException {
final Inflater inflator = new Inflater();
try (final InflaterInputStream in = new InflaterInputStream(new ByteArrayInputStream(data.array(), offset, length), inflator, 1024 * 8);
final FileOutputStream out = new FileOutputStream(inflatedFile, append);) {
LingoesLd2Reader.writeInputStream(in, out);
}
final long bytesRead = inflator.getBytesRead();
inflator.end();
return bytesRead;
}
private static final SensitiveStringDecoder[] detectEncodings(final ByteBuffer inflatedBytes, final int offsetWords, final int offsetXml, final int defTotal,
final int dataLen, final int[] idxData, final String[] defData) {
final int test = Math.min(defTotal, 10);
for (int j = 0; j < LingoesLd2Reader.AVAIL_ENCODINGS.length; j++) {
for (int k = 0; k < LingoesLd2Reader.AVAIL_ENCODINGS.length; k++) {
try {
for (int i = 0; i < test; i++) {
LingoesLd2Reader.readDefinitionData(inflatedBytes, offsetWords, offsetXml, dataLen, LingoesLd2Reader.AVAIL_ENCODINGS[j],
LingoesLd2Reader.AVAIL_ENCODINGS[k], idxData, defData, i);
}
System.out.println("词组编码:" + LingoesLd2Reader.AVAIL_ENCODINGS[j].name);
System.out.println("XML编码:" + LingoesLd2Reader.AVAIL_ENCODINGS[k].name);
return new SensitiveStringDecoder[] { LingoesLd2Reader.AVAIL_ENCODINGS[j], LingoesLd2Reader.AVAIL_ENCODINGS[k] };
} catch (final Throwable e) {
// ignore
}
}
}
System.err.println("自动识别编码失败!选择UTF-16LE继续。");
return new SensitiveStringDecoder[] { LingoesLd2Reader.AVAIL_ENCODINGS[1], LingoesLd2Reader.AVAIL_ENCODINGS[1] };
}
private static final void extract(final String inflatedFile, final String indexFile, final String extractedWordsFile, final String extractedXmlFile,
final String extractedOutputFile, final int[] idxArray, final int offsetDefs, final int offsetXml) throws IOException, FileNotFoundException,
UnsupportedEncodingException {
System.out.println("写入'" + extractedOutputFile + "'。。。");
int counter = 0;
try (RandomAccessFile file = new RandomAccessFile(inflatedFile, "r");
final FileWriter indexWriter = new FileWriter(indexFile);
final FileWriter defsWriter = new FileWriter(extractedWordsFile);
final FileWriter xmlWriter = new FileWriter(extractedXmlFile);
final FileWriter outputWriter = new FileWriter(extractedOutputFile);
// read inflated data
final FileChannel fChannel = file.getChannel();) {
final ByteBuffer dataRawBytes = ByteBuffer.allocate((int) fChannel.size());
fChannel.read(dataRawBytes);
fChannel.close();
dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);
dataRawBytes.rewind();
final int dataLen = 10;
final int defTotal = (offsetDefs / dataLen) - 1;
final String[] words = new String[defTotal];
final int[] idxData = new int[6];
final String[] defData = new String[2];
final SensitiveStringDecoder[] encodings = LingoesLd2Reader.detectEncodings(dataRawBytes, offsetDefs, offsetXml, defTotal, dataLen, idxData, defData);
dataRawBytes.position(8);
for (int i = 0; i < defTotal; i++) {
LingoesLd2Reader.readDefinitionData(dataRawBytes, offsetDefs, offsetXml, dataLen, encodings[0], encodings[1], idxData, defData, i);
words[i] = defData[0];
defsWriter.write(defData[0]);
defsWriter.write("\n");
xmlWriter.write(defData[1]);
xmlWriter.write("\n");
outputWriter.write(defData[0]);
outputWriter.write("=");
outputWriter.write(defData[1]);
outputWriter.write("\n");
System.out.println(defData[0] + " = " + defData[1]);
counter++;
}
for (int i = 0; i < idxArray.length; i++) {
final int idx = idxArray[i];
indexWriter.write(words[idx]);
indexWriter.write(", ");
indexWriter.write(String.valueOf(idx));
indexWriter.write("\n");
}
}
System.out.println("成功读出" + counter + "组数据。");
}
private static final void getIdxData(final ByteBuffer dataRawBytes, final int position, final int[] wordIdxData) {
dataRawBytes.position(position);
wordIdxData[0] = dataRawBytes.getInt(); // offset of this entry's word
wordIdxData[1] = dataRawBytes.getInt(); // offset of this entry's xml definition
wordIdxData[2] = dataRawBytes.get() & 0xff; // flags
wordIdxData[3] = dataRawBytes.get() & 0xff; // number of references to other entries
wordIdxData[4] = dataRawBytes.getInt(); // offset where the word ends (the next entry's word offset)
wordIdxData[5] = dataRawBytes.getInt(); // offset where the xml ends (the next entry's xml offset)
}
private static final void inflate(final ByteBuffer dataRawBytes, final List<Integer> deflateStreams, final String inflatedFile) {
System.out.println("解压缩'" + deflateStreams.size() + "'个数据流至'" + inflatedFile + "'。。。");
final int startOffset = dataRawBytes.position();
int offset = -1;
int lastOffset = startOffset;
boolean append = false;
try {
for (final Integer offsetRelative : deflateStreams) {
offset = startOffset + offsetRelative.intValue();
LingoesLd2Reader.decompress(inflatedFile, dataRawBytes, lastOffset, offset - lastOffset, append);
append = true;
lastOffset = offset;
}
} catch (final Throwable e) {
System.err.println("解压缩失败: 0x" + Integer.toHexString(offset) + ": " + e.toString());
}
}
private static final void readDefinitionData(final ByteBuffer inflatedBytes, final int offsetWords, final int offsetXml, final int dataLen,
final SensitiveStringDecoder wordStringDecoder, final SensitiveStringDecoder xmlStringDecoder, final int[] idxData, final String[] defData, final int i) {
LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * i, idxData);
int lastWordPos = idxData[0];
int lastXmlPos = idxData[1];
// final int flags = idxData[2];
int refs = idxData[3];
final int currentWordOffset = idxData[4];
int currenXmlOffset = idxData[5];
String xml = LingoesLd2Reader.strip(new String(xmlStringDecoder.decode(inflatedBytes.array(), offsetXml + lastXmlPos, currenXmlOffset - lastXmlPos)));
while (refs-- > 0) {
final int ref = inflatedBytes.getInt(offsetWords + lastWordPos);
LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * ref, idxData);
lastXmlPos = idxData[1];
currenXmlOffset = idxData[5];
if (xml.isEmpty()) {
xml = LingoesLd2Reader.strip(new String(xmlStringDecoder.decode(inflatedBytes.array(), offsetXml + lastXmlPos, currenXmlOffset - lastXmlPos)));
} else {
xml = LingoesLd2Reader.strip(new String(xmlStringDecoder.decode(inflatedBytes.array(), offsetXml + lastXmlPos, currenXmlOffset - lastXmlPos))) + ", "
+ xml;
}
lastWordPos += 4;
}
defData[1] = xml;
final String word = new String(wordStringDecoder.decode(inflatedBytes.array(), offsetWords + lastWordPos, currentWordOffset - lastWordPos));
defData[0] = word;
}
private static final void readDictionary(final String ld2File, final ByteBuffer dataRawBytes, final int offsetWithIndex) throws IOException,
FileNotFoundException, UnsupportedEncodingException {
System.out.println("词典类型:0x" + Integer.toHexString(dataRawBytes.getInt(offsetWithIndex)));
final int limit = dataRawBytes.getInt(offsetWithIndex + 4) + offsetWithIndex + 8;
final int offsetIndex = offsetWithIndex + 0x1C;
final int offsetCompressedDataHeader = dataRawBytes.getInt(offsetWithIndex + 8) + offsetIndex;
final int inflatedWordsIndexLength = dataRawBytes.getInt(offsetWithIndex + 12);
final int inflatedWordsLength = dataRawBytes.getInt(offsetWithIndex + 16);
final int inflatedXmlLength = dataRawBytes.getInt(offsetWithIndex + 20);
final int definitions = (offsetCompressedDataHeader - offsetIndex) / 4;
final List<Integer> deflateStreams = new ArrayList<>();
dataRawBytes.position(offsetCompressedDataHeader + 8);
int offset = dataRawBytes.getInt();
while ((offset + dataRawBytes.position()) < limit) {
offset = dataRawBytes.getInt();
deflateStreams.add(Integer.valueOf(offset));
}
final int offsetCompressedData = dataRawBytes.position();
System.out.println("索引词组数目:" + definitions);
System.out.println("索引地址/大小:0x" + Integer.toHexString(offsetIndex) + " / " + (offsetCompressedDataHeader - offsetIndex) + " B");
System.out.println("压缩数据地址/大小:0x" + Integer.toHexString(offsetCompressedData) + " / " + (limit - offsetCompressedData) + " B");
System.out.println("词组索引地址/大小(解压缩后):0x0 / " + inflatedWordsIndexLength + " B");
System.out.println("词组地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength) + " / " + inflatedWordsLength + " B");
System.out.println("XML地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength + inflatedWordsLength) + " / " + inflatedXmlLength + " B");
System.out.println("文件大小(解压缩后):" + ((inflatedWordsIndexLength + inflatedWordsLength + inflatedXmlLength) / 1024) + " KB");
final String inflatedFile = ld2File + ".inflated";
LingoesLd2Reader.inflate(dataRawBytes, deflateStreams, inflatedFile);
if (new File(inflatedFile).isFile()) {
final String indexFile = ld2File + ".idx";
final String extractedFile = ld2File + ".words";
final String extractedXmlFile = ld2File + ".xml";
final String extractedOutputFile = ld2File + ".output";
dataRawBytes.position(offsetIndex);
final int[] idxArray = new int[definitions];
for (int i = 0; i < definitions; i++) {
idxArray[i] = dataRawBytes.getInt();
}
LingoesLd2Reader.extract(inflatedFile, indexFile, extractedFile, extractedXmlFile, extractedOutputFile, idxArray, inflatedWordsIndexLength,
inflatedWordsIndexLength + inflatedWordsLength);
}
}
private static final String strip(final String xml) {
int open = 0;
int end = 0;
if ((open = xml.indexOf("<![CDATA[")) != -1) {
if ((end = xml.indexOf("]]>", open)) != -1) {
return xml.substring(open + "<![CDATA[".length(), end).replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
}
} else if ((open = xml.indexOf("<Ô")) != -1) {
if ((end = xml.indexOf("</Ô", open)) != -1) {
open = xml.indexOf(">", open + 1);
return xml.substring(open + 1, end).replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
}
} else {
final StringBuilder sb = new StringBuilder();
end = 0;
open = xml.indexOf('<');
do {
if ((open - end) > 1) {
sb.append(xml.substring(end + 1, open));
}
open = xml.indexOf('<', open + 1);
end = xml.indexOf('>', end + 1);
} while ((open != -1) && (end != -1));
return sb.toString().replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
}
return "";
}
private static final void writeInputStream(final InputStream in, final OutputStream out) throws IOException {
final byte[] buffer = new byte[1024 * 8];
int len;
while ((len = in.read(buffer)) > 0) {
out.write(buffer, 0, len);
}
}
private static class SensitiveStringDecoder {
public final String name;
private final CharsetDecoder cd;
SensitiveStringDecoder(final Charset cs) {
this.cd = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
this.name = cs.name();
}
char[] decode(final byte[] ba, final int off, final int len) {
final int en = (int) (len * (double) this.cd.maxCharsPerByte());
final char[] ca = new char[en];
if (len == 0) {
return ca;
}
this.cd.reset();
final ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
final CharBuffer cb = CharBuffer.wrap(ca);
try {
CoderResult cr = this.cd.decode(bb, cb, true);
if (!cr.isUnderflow()) {
cr.throwException();
}
cr = this.cd.flush(cb);
if (!cr.isUnderflow()) {
cr.throwException();
}
} catch (final CharacterCodingException x) {
// REPORT is set on this decoder, so a wrong charset guess lands here;
// rethrow so detectEncodings() can catch it and try the next candidate
throw new Error(x);
}
return SensitiveStringDecoder.safeTrim(ca, cb.position());
}
private static char[] safeTrim(final char[] ca, final int len) {
if (len == ca.length) {
return ca;
} else {
return Arrays.copyOf(ca, len);
}
}
}
}
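A minimal sketch of the encoding auto-detection idea behind SensitiveStringDecoder and detectEncodings(): because the decoders are configured with CodingErrorAction.REPORT, a wrong charset guess throws instead of silently substituting U+FFFD, so the first candidate that decodes cleanly is accepted. The sample bytes below are made up for illustration and do not come from an LD2 file.

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

/**
 * Sketch only: strict decoders reject the wrong guess (UTF-8) for UTF-16LE bytes
 * and accept the right one, which is all the detection loop relies on.
 */
public class EncodingDetectSketch {
  public static void main(final String[] args) {
    final byte[] sample = "拼音词典".getBytes(StandardCharsets.UTF_16LE);
    for (final String candidate : new String[] { "UTF-8", "UTF-16LE" }) {
      final CharsetDecoder cd = Charset.forName(candidate).newDecoder()
          .onMalformedInput(CodingErrorAction.REPORT)
          .onUnmappableCharacter(CodingErrorAction.REPORT);
      try {
        // CharsetDecoder.decode() honours REPORT and throws on malformed input
        System.out.println(candidate + " -> " + cd.decode(ByteBuffer.wrap(sample)));
      } catch (final CharacterCodingException e) {
        System.out.println(candidate + " -> rejected: " + e);
      }
    }
  }
}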
/* QQ Pinyin IME qpyd dictionary file parser
* Copyright (c) 2010 Xiaoyun Zhu
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.zip.InflaterOutputStream;
/**
* QQ Pinyin IME QPYD File Reader
*
* <pre>
* QPYD Format overview:
*
* General Information:
* - Chinese characters are all encoded with UTF-16LE.
* - Pinyin is encoded in ASCII (or UTF-8).
* - Numbers use little-endian byte order.
*
* QPYD hex analysis:
* - 0x00 QPYD file identifier
* - 0x38 offset of compressed data (word-pinyin-dictionary)
* - 0x44 total words in qpyd
* - 0x60 start of header information
*
* Compressed data analysis:
* - zlib/deflate compression (the stream begins with 0x789C) is used in all analyzed qpyd files
* - data is divided into two parts
* -- 1. offset and length information (10 bytes for each pinyin-word pair)
* 0x00 length of pinyin
* 0x01 length of word
* 0x02 unknown (4 bytes)
* 0x06 offset of the pinyin (4 bytes)
* -- 2. actual data
* Dictionary data has the form ((pinyin)(word))* with no separators.
* Data can only be read using offset and length information.
*
* </pre>
*
*/
public class QQPinyinQpydReader {
public static void main(final String[] args) throws IOException {
// download from http://dict.py.qq.com/list.php
final String qqydFile = args[0];
// read qpyd into byte array
final ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
try (RandomAccessFile file = new RandomAccessFile(qqydFile, "r"); final FileChannel fChannel = file.getChannel();) {
fChannel.transferTo(0, fChannel.size(), Channels.newChannel(dataOut));
}
// qpyd as bytes
final ByteBuffer dataRawBytes = ByteBuffer.wrap(dataOut.toByteArray());
dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);
System.out.println("文件: " + qqydFile);
// read info of compressed data
final int startZippedDictAddr = dataRawBytes.getInt(0x38);
final int zippedDictLength = dataRawBytes.limit() - startZippedDictAddr;
// qpys as UTF-16LE string
final String dataString = new String(Arrays.copyOfRange(dataRawBytes.array(), 0x60, startZippedDictAddr), "UTF-16LE");
// print header
System.out.println("名称:" + QQPinyinQpydReader.substringBetween(dataString, "Name: ", "\r\n"));
System.out.println("类型:" + QQPinyinQpydReader.substringBetween(dataString, "Type: ", "\r\n"));
System.out.println("子类型:" + QQPinyinQpydReader.substringBetween(dataString, "FirstType: ", "\r\n"));
System.out.println("词库说明:" + QQPinyinQpydReader.substringBetween(dataString, "Intro: ", "\r\n"));
System.out.println("词库样例:" + QQPinyinQpydReader.substringBetween(dataString, "Example: ", "\r\n"));
System.out.println("词条数:" + dataRawBytes.getInt(0x44));
// read zipped qqyd dictionary into byte array
dataOut.reset();
try (InflaterOutputStream inflater = new InflaterOutputStream(dataOut);) {
Channels.newChannel(inflater).write(ByteBuffer.wrap(dataRawBytes.array(), startZippedDictAddr, zippedDictLength));
}
// uncompressed qqyd dictionary as bytes
final ByteBuffer dataUnzippedBytes = ByteBuffer.wrap(dataOut.toByteArray());
dataUnzippedBytes.order(ByteOrder.LITTLE_ENDIAN);
// for debugging: save unzipped data to *.unzipped file
try (FileOutputStream out = new FileOutputStream(qqydFile + ".unzipped");) {
Channels.newChannel(out).write(dataUnzippedBytes);
System.out.println("压缩数据:0x" + Integer.toHexString(startZippedDictAddr) + " (解压前:" + zippedDictLength + " B, 解压后:" + dataUnzippedBytes.limit() + " B)");
}
// stores the start address of actual dictionary data
int unzippedDictStartAddr = -1;
final byte[] byteArray = dataUnzippedBytes.array();
dataUnzippedBytes.position(0);
while ((unzippedDictStartAddr == -1) || (dataUnzippedBytes.position() < unzippedDictStartAddr)) {
// read word
final int pinyinLength = dataUnzippedBytes.get() & 0xff;
final int wordLength = dataUnzippedBytes.get() & 0xff;
dataUnzippedBytes.getInt(); // garbage
final int pinyinStartAddr = dataUnzippedBytes.getInt();
final int wordStartAddr = pinyinStartAddr + pinyinLength;
if (unzippedDictStartAddr == -1) {
unzippedDictStartAddr = pinyinStartAddr;
System.out.println("词库地址(解压后):0x" + Integer.toHexString(unzippedDictStartAddr) + "\n");
}
final String pinyin = new String(Arrays.copyOfRange(byteArray, pinyinStartAddr, pinyinStartAddr + pinyinLength), "UTF-8");
final String word = new String(Arrays.copyOfRange(byteArray, wordStartAddr, wordStartAddr + wordLength), "UTF-16LE");
System.out.println(word + "\t" + pinyin);
}
}
public static final String substringBetween(final String text, final String start, final String end) {
final int nStart = text.indexOf(start);
final int nEnd = text.indexOf(end, nStart + 1);
if ((nStart != -1) && (nEnd != -1)) {
return text.substring(nStart + start.length(), nEnd);
} else {
return null;
}
}
}
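A minimal sketch of one decompressed qpyd entry as QQPinyinQpydReader interprets it: a 10-byte index record (pinyin length, word length, 4 unknown bytes, pinyin offset) followed by the data part, where the pinyin (UTF-8) is immediately followed by the word (UTF-16LE). The record here is synthetic, not taken from a real .qpyd file.

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;

/**
 * Sketch only: build one qpyd-style index record plus its data part, then parse
 * it back with the same steps as QQPinyinQpydReader.main().
 */
public class QpydEntrySketch {
  public static void main(final String[] args) throws UnsupportedEncodingException {
    final byte[] pinyin = "ni'hao".getBytes("UTF-8");
    final byte[] word = "你好".getBytes("UTF-16LE");
    final int recordLen = 10; // a single 10-byte record in this sketch
    final ByteBuffer data = ByteBuffer.allocate(recordLen + pinyin.length + word.length).order(ByteOrder.LITTLE_ENDIAN);
    data.put((byte) pinyin.length); // 0x00 length of pinyin
    data.put((byte) word.length); // 0x01 length of word
    data.putInt(0); // 0x02 unknown
    data.putInt(recordLen); // 0x06 offset of the pinyin within the decompressed data
    data.put(pinyin).put(word); // data part: pinyin directly followed by word
    // parse the record back, as in QQPinyinQpydReader.main()
    final byte[] byteArray = data.array();
    data.position(0);
    final int pinyinLength = data.get() & 0xff;
    final int wordLength = data.get() & 0xff;
    data.getInt(); // unknown
    final int pinyinStartAddr = data.getInt();
    final int wordStartAddr = pinyinStartAddr + pinyinLength;
    System.out.println(new String(Arrays.copyOfRange(byteArray, wordStartAddr, wordStartAddr + wordLength), "UTF-16LE")
        + "\t" + new String(Arrays.copyOfRange(byteArray, pinyinStartAddr, pinyinStartAddr + pinyinLength), "UTF-8"));
  }
}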
/* Sogou Pinyin IME SCEL dictionary file parser
* Copyright (c) 2010 Xiaoyun Zhu
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
/**
* Sogou Pinyin IME SCEL File Reader
*
* <pre>
* SCEL Format overview:
*
* General Information:
* - Chinese characters and pinyin are all encoded with UTF-16LE.
* - Numbers use little-endian byte order.
*
* SCEL hex analysis:
* - 0x0 Pinyin List Offset
* - 0x120 total number of words
* - 0x<PY-Offset> total number of pinyin
* - ... List of pinyin as [index, byte length of pinyin, pinyin as string] triples
* - ... Dictionary
* - ... <additional garbage>
*
* Dictionary format:
* - It can be interpreted as a list of
* [number of word alternatives,
* byte length of the pinyin index list, pinyin indexes (shorts),
* per alternative: [byte length of word, word as UTF-16LE string, length of skip bytes, skip bytes]
* ].
*
* </pre>
*
*/
class SogouScelReader {
public static void main(final String[] args) throws IOException {
// download from http://pinyin.sogou.com/dict
final String scelFile = args[0];
// read scel into byte array
final ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
try (RandomAccessFile file = new RandomAccessFile(scelFile, "r"); final FileChannel fChannel = file.getChannel();) {
fChannel.transferTo(0, fChannel.size(), Channels.newChannel(dataOut));
}
// scel as bytes
final ByteBuffer dataRawBytes = ByteBuffer.wrap(dataOut.toByteArray());
dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);
System.out.println("文件: " + scelFile);
final byte[] buf = new byte[1024];
final String[] pyDict = new String[512];
final int totalWords = dataRawBytes.getInt(0x120);
// pinyin offset
dataRawBytes.position(dataRawBytes.getInt());
final int totalPinyin = dataRawBytes.getInt();
for (int i = 0; i < totalPinyin; i++) {
final int idx = dataRawBytes.getShort();
final int len = dataRawBytes.getShort();
dataRawBytes.get(buf, 0, len);
pyDict[idx] = new String(buf, 0, len, "UTF-16LE");
}
// extract dictionary
int counter = 0;
for (int i = 0; i < totalWords; i++) {
final StringBuilder py = new StringBuilder();
final StringBuilder word = new StringBuilder();
int alternatives = dataRawBytes.getShort();
int pyLength = dataRawBytes.getShort() / 2;
boolean first = true;
while (pyLength-- > 0) {
final int key = dataRawBytes.getShort();
if (first) {
first = false;
} else {
py.append('\'');
}
py.append(pyDict[key]);
}
first = true;
while (alternatives-- > 0) {
if (first) {
first = false;
} else {
word.append(", ");
}
final int wordlength = dataRawBytes.getShort();
dataRawBytes.get(buf, 0, wordlength);
word.append(new String(buf, 0, wordlength, "UTF-16LE"));
// skip bytes
dataRawBytes.get(buf, 0, dataRawBytes.getShort());
}
System.out.println(word.toString() + "\t" + py.toString());
counter++;
}
System.out.println("\n读出词汇'" + scelFile + "': " + counter);
}
}
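A minimal sketch of one SCEL dictionary entry in the layout described in the overview above, built in memory against a tiny stand-in pinyin table (synthetic data, not from a real .scel file) and decoded with the same steps as the inner loop of SogouScelReader.main().

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

/**
 * Sketch only: one SCEL-style entry -- [number of alternatives (short), byte
 * length of pinyin index list (short), pinyin indexes (shorts), then per
 * alternative: word byte length, word (UTF-16LE), skip length, skip bytes].
 */
public class ScelEntrySketch {
  public static void main(final String[] args) throws UnsupportedEncodingException {
    final String[] pyDict = { "ni", "hao" }; // stand-in for the pinyin table read from the file header
    final byte[] word = "你好".getBytes("UTF-16LE");
    final ByteBuffer entry = ByteBuffer.allocate(2 + 2 + (2 * 2) + 2 + word.length + 2).order(ByteOrder.LITTLE_ENDIAN);
    entry.putShort((short) 1); // one word alternative
    entry.putShort((short) 4); // byte length of the pinyin index list (2 indexes * 2 bytes)
    entry.putShort((short) 0).putShort((short) 1); // indexes into pyDict: "ni", "hao"
    entry.putShort((short) word.length).put(word); // the word itself
    entry.putShort((short) 0); // no skip bytes
    entry.rewind();
    // decode exactly like the inner loop of SogouScelReader.main()
    final byte[] buf = new byte[1024];
    final StringBuilder py = new StringBuilder();
    final StringBuilder text = new StringBuilder();
    int alternatives = entry.getShort();
    int pyLength = entry.getShort() / 2;
    while (pyLength-- > 0) {
      if (py.length() > 0) {
        py.append('\'');
      }
      py.append(pyDict[entry.getShort()]);
    }
    while (alternatives-- > 0) {
      final int wordLength = entry.getShort();
      entry.get(buf, 0, wordLength);
      if (text.length() > 0) {
        text.append(", ");
      }
      text.append(new String(buf, 0, wordLength, "UTF-16LE"));
      entry.get(buf, 0, entry.getShort()); // skip bytes
    }
    System.out.println(text + "\t" + py); // prints: 你好  ni'hao
  }
}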
@yourtion (Author):

USE:

$ javac SogouScelReader.java
$ java SogouScelReader 饮食大全.scel
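The other readers are invoked the same way, assuming each class is saved to its own .java file; the dictionary file names below are placeholders:

$ javac BaiduBcdReader.java
$ java BaiduBcdReader some-dict.bcd

$ javac QQPinyinQpydReader.java
$ java QQPinyinQpydReader some-dict.qpyd

$ javac LingoesLd2Reader.java
$ java LingoesLd2Reader some-dict.ld2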
