yourtion/BaiduBcdReader.java

## BaiduBcdReader.java
/* 百度拼音输入法BCD词库文件解析
 * Copyright (c) 2010 Xiaoyun Zhu
 *
 *  Permission is hereby granted, free of charge, to any person obtaining a copy
 *  of this software and associated documentation files (the "Software"), to deal
 *  in the Software without restriction, including without limitation the rights
 *  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 *  copies of the Software, and to permit persons to whom the Software is
 *  furnished to do so, subject to the following conditions:
 *
 *  The above copyright notice and this permission notice shall be included in
 *  all copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 *  THE SOFTWARE.
 */
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;

import cn.kk.kkdict.utils.Helper;

/**
 * Baidu Pinyin IME BDICT File Reader
 *
 * <pre>
 * BDICT Format overview (as read by this class):
 *
 * General Information:
 * - Chinese characters are encoded with UTF-16LE.
 * - Pinyin is stored as one (initial index, final index) byte pair per character.
 * - Numbers are using little endian byte order.
 *
 * BDICT hex analysis:
 * - 0x250         total number of words
 * - 0x350         start of the dictionary
 *
 * Dictionary format:
 * - It can be interpreted as a list of
 *   [amount of characters (short not integer!),
 *       two bytes that are skipped,
 *       pinyin construction using one initial/final index pair per character,
 *       word as string
 *   ].
 *
 * </pre>
 *
 * @author keke
 */
public class BaiduBcdReader {
  /** Pinyin initials, selected by the first byte of every syllable pair. */
  private static final String[] FEN_MU = { "c", "d", "b", "f", "g", "h", "ch", "j", "k", "l", "m", "n", "", "p", "q", "r", "s", "t", "sh", "zh", "w", "x",
      "y", "z" };

  /**
   * Pinyin finals, selected by the second byte of every syllable pair. The table is grouped by length (4, 3, 2, 1 letters); every final appears exactly once.
   */
  private static final String[] YUN_MU = { "uang", "iang", "iong", "ang", "eng", "ian", "iao", "ing", "ong", "uai", "uan", "ai", "an", "ao", "ei", "en",
      "er", "ua", "ie", "in", "iu", "ou", "ia", "ue", "ui", "un", "uo", "a", "e", "i", "o", "u", "v" };

  /** Absolute position of the 32-bit word count. */
  private static final int    TOTAL_WORDS_OFFSET = 0x250;
  /** Absolute position of the first dictionary entry. */
  private static final int    DICTIONARY_OFFSET  = 0x350;

  /**
   * Prints every word of a BDICT file together with its pinyin.
   *
   * @param args
   *          args[0] is the path of the BDICT file
   */
  public static void main(final String[] args) {
    if (args.length < 1) {
      System.err.println("Usage: java BaiduBcdReader <bdict file>");
      return;
    }
    // download from http://r6.mo.baidu.com/web/iw/index/
    BaiduBcdReader.analyze(args[0]);
  }

  /**
   * Loads the whole file into memory and prints one "word TAB pinyin" line per entry. I/O errors are reported on stderr.
   *
   * @param bdictFile
   *          path of the BDICT file
   */
  private static void analyze(final String bdictFile) {
    try (RandomAccessFile file = new RandomAccessFile(bdictFile, "r"); FileChannel fChannel = file.getChannel()) {
      // read bdict into byte array
      final ByteBuffer dataRawBytes = ByteBuffer.allocate((int) fChannel.size());
      while (dataRawBytes.hasRemaining() && (fChannel.read(dataRawBytes) != -1)) {
        // a single read() is not guaranteed to fill the buffer
      }
      dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);

      System.out.println("文件: " + bdictFile);

      final int total = dataRawBytes.getInt(BaiduBcdReader.TOTAL_WORDS_OFFSET);
      dataRawBytes.position(BaiduBcdReader.DICTIONARY_OFFSET);
      for (int i = 0; i < total; i++) {
        final int length = dataRawBytes.getShort();
        dataRawBytes.getShort(); // skipped
        final StringBuilder pinyin = new StringBuilder();
        for (int j = 0; j < length; j++) {
          if (j > 0) {
            pinyin.append('\'');
          }
          // first byte selects the initial, second byte the final
          pinyin.append(BaiduBcdReader.FEN_MU[dataRawBytes.get()]).append(BaiduBcdReader.YUN_MU[dataRawBytes.get()]);
        }
        // sized per word, so long words cannot overflow a fixed scratch buffer
        final byte[] wordBytes = new byte[2 * length];
        dataRawBytes.get(wordBytes);
        final String word = new String(wordBytes, "UTF-16LE");
        System.out.println(word + "\t" + pinyin);
      }

      System.out.println("\nExtracted '" + bdictFile + "': " + total);
    } catch (final IOException e) {
      System.err.println("Error: " + e);
    }
  }
}

## LingoesLd2Reader.java
/*  Lingoes灵格斯电子词典LD2(LDF)文件解析
 *  Copyright (c) 2010 Xiaoyun Zhu
 *
 *  Permission is hereby granted, free of charge, to any person obtaining a copy
 *  of this software and associated documentation files (the "Software"), to deal
 *  in the Software without restriction, including without limitation the rights
 *  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 *  copies of the Software, and to permit persons to whom the Software is
 *  furnished to do so, subject to the following conditions:
 *
 *  The above copyright notice and this permission notice shall be included in
 *  all copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 *  THE SOFTWARE.
 */

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

/**
 * Lingoes LD2/LDF File Reader
 *
 * <pre>
 * Lingoes Format overview:
 *
 * General Information:
 * - Dictionary data are stored in deflate streams.
 * - Index group information is stored in an index array in the LD2 file itself.
 * - Numbers are using little endian byte order.
 * - Definitions and xml data have UTF-8 or UTF-16LE encodings.
 *
 * LD2 file schema:
 * - File Header
 * - File Description
 * - Additional Information (optional)
 * - Index Group (corresponds to definitions in dictionary)
 * - Deflated Dictionary Streams
 * -- Index Data
 * --- Offsets of definitions
 * --- Offsets of translations
 * --- Flags
 * --- References to other translations
 * -- Definitions
 * -- Translations (xml)
 *
 * Files written next to the input file:
 * - .inflated  concatenated inflated streams
 * - .idx       index words, UTF-8
 * - .words     definitions, UTF-8
 * - .xml       stripped translations, UTF-8
 * - .output    "definition=translation" lines, UTF-8
 *
 * TODO: find encoding / language fields to replace auto-detect of encodings
 *
 * </pre>
 *
 * @author keke
 *
 */
public class LingoesLd2Reader {
  /** Candidate encodings, tried in this order by the auto-detection; element 1 (UTF-16LE) is the fallback. */
  private static final SensitiveStringDecoder[] AVAIL_ENCODINGS = { new SensitiveStringDecoder(Charset.forName("UTF-8")),
      new SensitiveStringDecoder(Charset.forName("UTF-16LE")), new SensitiveStringDecoder(Charset.forName("UTF-16BE")),
      new SensitiveStringDecoder(Charset.forName("EUC-JP"))    };

  private static final String                   CDATA_OPEN      = "<![CDATA[";

  /**
   * Prints the header information of an LD2 file and, when the file contains dictionary data, extracts it.
   *
   * @param args
   *          args[0] is the path of the LD2 file
   * @throws IOException
   *           if the file cannot be read or an output file cannot be written
   */
  public static void main(final String[] args) throws IOException {
    if (args.length < 1) {
      System.err.println("Usage: java LingoesLd2Reader <file.ld2>");
      return;
    }
    // download from
    // https://skydrive.live.com/?cid=a10100d37adc7ad3&sc=documents&id=A10100D37ADC7AD3%211172#cid=A10100D37ADC7AD3&sc=documents
    final String ld2File = args[0];

    // read lingoes ld2 into byte array
    final ByteBuffer dataRawBytes;
    try (RandomAccessFile file = new RandomAccessFile(ld2File, "r"); FileChannel fChannel = file.getChannel()) {
      dataRawBytes = LingoesLd2Reader.readFully(fChannel);
    }

    System.out.println("文件：" + ld2File);
    System.out.println("类型：" + new String(dataRawBytes.array(), 0, 4, "ASCII"));
    System.out.println("版本：" + dataRawBytes.getShort(0x18) + "." + dataRawBytes.getShort(0x1A));
    System.out.println("ID: 0x" + Long.toHexString(dataRawBytes.getLong(0x1C)));

    final int offsetData = dataRawBytes.getInt(0x5C) + 0x60;
    if (dataRawBytes.limit() > offsetData) {
      System.out.println("简介地址：0x" + Integer.toHexString(offsetData));
      final int type = dataRawBytes.getInt(offsetData);
      System.out.println("简介类型：0x" + Integer.toHexString(type));
      final int offsetWithInfo = dataRawBytes.getInt(offsetData + 4) + offsetData + 12;
      if (type == 3) {
        // without additional information
        LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetData);
      } else if (dataRawBytes.limit() > (offsetWithInfo - 0x1C)) {
        LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetWithInfo);
      } else {
        System.err.println("文件不包含字典数据。网上字典？");
      }
    } else {
      System.err.println("文件不包含字典数据。网上字典？");
    }
  }

  /**
   * Reads the complete channel into a little-endian buffer positioned at 0.
   *
   * @param channel
   *          channel to read, starting at its current position
   * @return buffer whose capacity is the channel size
   * @throws IOException
   *           if reading fails
   */
  private static ByteBuffer readFully(final FileChannel channel) throws IOException {
    final ByteBuffer buffer = ByteBuffer.allocate((int) channel.size());
    while (buffer.hasRemaining() && (channel.read(buffer) != -1)) {
      // a single read() is not guaranteed to fill the buffer
    }
    buffer.order(ByteOrder.LITTLE_ENDIAN);
    buffer.rewind();
    return buffer;
  }

  /**
   * Opens a text file for writing with a fixed UTF-8 encoding, so the extracted text does not depend on the platform default charset.
   */
  private static Writer newUtf8Writer(final String fileName) throws IOException {
    return new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
  }

  /**
   * Inflates one deflate stream and writes the result to a file.
   *
   * @param inflatedFile
   *          target file
   * @param data
   *          buffer holding the compressed bytes
   * @param offset
   *          start of the stream inside the backing array of {@code data}
   * @param length
   *          number of compressed bytes available to the stream
   * @param append
   *          whether to append to the target file instead of overwriting it
   * @return number of compressed bytes consumed by the inflater
   * @throws IOException
   *           if the stream is corrupt or the target cannot be written
   */
  private static long decompress(final String inflatedFile, final ByteBuffer data, final int offset, final int length, final boolean append)
      throws IOException {
    final Inflater inflator = new Inflater();
    try (InflaterInputStream in = new InflaterInputStream(new ByteArrayInputStream(data.array(), offset, length), inflator, 1024 * 8);
        FileOutputStream out = new FileOutputStream(inflatedFile, append)) {
      LingoesLd2Reader.writeInputStream(in, out);
      return inflator.getBytesRead();
    } finally {
      // the stream does not own a caller-supplied inflater, so release it here even on failure
      inflator.end();
    }
  }

  /**
   * Finds the first (word encoding, xml encoding) pair that decodes the first definitions (at most 10) without error.
   *
   * @return two-element array {word decoder, xml decoder}; both UTF-16LE when no pair fits
   */
  private static SensitiveStringDecoder[] detectEncodings(final ByteBuffer inflatedBytes, final int offsetWords, final int offsetXml, final int defTotal,
      final int dataLen, final int[] idxData, final String[] defData) {
    final int test = Math.min(defTotal, 10);
    for (final SensitiveStringDecoder wordDecoder : LingoesLd2Reader.AVAIL_ENCODINGS) {
      for (final SensitiveStringDecoder xmlDecoder : LingoesLd2Reader.AVAIL_ENCODINGS) {
        try {
          for (int i = 0; i < test; i++) {
            LingoesLd2Reader.readDefinitionData(inflatedBytes, offsetWords, offsetXml, dataLen, wordDecoder, xmlDecoder, idxData, defData, i);
          }
          System.out.println("词组编码：" + wordDecoder.name);
          System.out.println("XML编码：" + xmlDecoder.name);
          return new SensitiveStringDecoder[] { wordDecoder, xmlDecoder };
        } catch (final RuntimeException ignored) {
          // this pair cannot decode the sample (malformed input or offsets out of range): try the next pair
        }
      }
    }
    System.err.println("自动识别编码失败！选择UTF-16LE继续。");
    return new SensitiveStringDecoder[] { LingoesLd2Reader.AVAIL_ENCODINGS[1], LingoesLd2Reader.AVAIL_ENCODINGS[1] };
  }

  /**
   * Reads the inflated data and writes the index, words, xml and combined output files.
   *
   * @param inflatedFile
   *          file produced by {@link #inflate}
   * @param idxArray
   *          definition numbers taken from the index group of the LD2 file
   * @param offsetDefs
   *          start of the definitions inside the inflated data (equals the length of the index data)
   * @param offsetXml
   *          start of the translations inside the inflated data
   * @throws IOException
   *           if the inflated file cannot be read or an output file cannot be written
   */
  private static void extract(final String inflatedFile, final String indexFile, final String extractedWordsFile, final String extractedXmlFile,
      final String extractedOutputFile, final int[] idxArray, final int offsetDefs, final int offsetXml) throws IOException {
    System.out.println("写入'" + extractedOutputFile + "'。。。");

    int counter = 0;
    try (RandomAccessFile file = new RandomAccessFile(inflatedFile, "r");
        Writer indexWriter = LingoesLd2Reader.newUtf8Writer(indexFile);
        Writer defsWriter = LingoesLd2Reader.newUtf8Writer(extractedWordsFile);
        Writer xmlWriter = LingoesLd2Reader.newUtf8Writer(extractedXmlFile);
        Writer outputWriter = LingoesLd2Reader.newUtf8Writer(extractedOutputFile);
        // read inflated data
        FileChannel fChannel = file.getChannel()) {
      final ByteBuffer dataRawBytes = LingoesLd2Reader.readFully(fChannel);

      // every index entry is 10 bytes long; the last entry only terminates the one before it
      final int dataLen = 10;
      final int defTotal = (offsetDefs / dataLen) - 1;

      final String[] words = new String[defTotal];
      final int[] idxData = new int[6];
      final String[] defData = new String[2];

      final SensitiveStringDecoder[] encodings = LingoesLd2Reader.detectEncodings(dataRawBytes, offsetDefs, offsetXml, defTotal, dataLen, idxData, defData);

      for (int i = 0; i < defTotal; i++) {
        LingoesLd2Reader.readDefinitionData(dataRawBytes, offsetDefs, offsetXml, dataLen, encodings[0], encodings[1], idxData, defData, i);

        words[i] = defData[0];
        defsWriter.write(defData[0]);
        defsWriter.write("\n");

        xmlWriter.write(defData[1]);
        xmlWriter.write("\n");

        outputWriter.write(defData[0]);
        outputWriter.write("=");
        outputWriter.write(defData[1]);
        outputWriter.write("\n");

        System.out.println(defData[0] + " = " + defData[1]);
        counter++;
      }

      for (final int idx : idxArray) {
        indexWriter.write(words[idx]);
        indexWriter.write(", ");
        indexWriter.write(String.valueOf(idx));
        indexWriter.write("\n");
      }
    }
    System.out.println("成功读出" + counter + "组数据。");
  }

  /**
   * Reads 18 bytes of index data starting at {@code position}: two ints, two unsigned bytes and two more ints. With 10-byte entries the last two ints are the
   * first two ints of the following entry.
   *
   * @param wordIdxData
   *          receives the six values; must have room for 6 elements
   */
  private static void getIdxData(final ByteBuffer dataRawBytes, final int position, final int[] wordIdxData) {
    dataRawBytes.position(position);
    wordIdxData[0] = dataRawBytes.getInt();
    wordIdxData[1] = dataRawBytes.getInt();
    wordIdxData[2] = dataRawBytes.get() & 0xff;
    wordIdxData[3] = dataRawBytes.get() & 0xff;
    wordIdxData[4] = dataRawBytes.getInt();
    wordIdxData[5] = dataRawBytes.getInt();
  }

  /**
   * Inflates all deflate streams, one after the other, into a single file. Works on a best-effort basis: a failure is reported on stderr and whatever has been
   * inflated so far is kept.
   *
   * @param dataRawBytes
   *          LD2 data, positioned at the start of the first stream
   * @param deflateStreams
   *          end offset of every stream, relative to the start of the first stream
   */
  private static void inflate(final ByteBuffer dataRawBytes, final List<Integer> deflateStreams, final String inflatedFile) {
    System.out.println("解压缩'" + deflateStreams.size() + "'个数据流至'" + inflatedFile + "'。。。");
    final int startOffset = dataRawBytes.position();
    int offset = -1;
    int lastOffset = startOffset;
    boolean append = false;
    try {
      for (final Integer offsetRelative : deflateStreams) {
        offset = startOffset + offsetRelative.intValue();
        LingoesLd2Reader.decompress(inflatedFile, dataRawBytes, lastOffset, offset - lastOffset, append);
        append = true;
        lastOffset = offset;
      }
    } catch (final IOException | RuntimeException e) {
      System.err.println("解压缩失败: 0x" + Integer.toHexString(offset) + ": " + e.toString());
    }
  }

  /**
   * Decodes definition number {@code i}.
   *
   * @param idxData
   *          scratch array of 6 elements, overwritten
   * @param defData
   *          receives the word in element 0 and the stripped translation in element 1; when the entry has references, the translations of the referenced
   *          entries are put in front, separated by ", "
   */
  private static void readDefinitionData(final ByteBuffer inflatedBytes, final int offsetWords, final int offsetXml, final int dataLen,
      final SensitiveStringDecoder wordStringDecoder, final SensitiveStringDecoder xmlStringDecoder, final int[] idxData, final String[] defData, final int i) {
    LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * i, idxData);
    int lastWordPos = idxData[0];
    // idxData[2] holds flags, unused here
    int refs = idxData[3];
    final int currentWordOffset = idxData[4];

    String xml = LingoesLd2Reader.readXml(inflatedBytes, xmlStringDecoder, offsetXml + idxData[1], idxData[5] - idxData[1]);
    while (refs-- > 0) {
      // the word data starts with one 32-bit entry number per reference
      final int ref = inflatedBytes.getInt(offsetWords + lastWordPos);
      LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * ref, idxData);
      final String refXml = LingoesLd2Reader.readXml(inflatedBytes, xmlStringDecoder, offsetXml + idxData[1], idxData[5] - idxData[1]);
      xml = xml.isEmpty() ? refXml : refXml + ", " + xml;
      lastWordPos += 4;
    }
    defData[1] = xml;

    defData[0] = new String(wordStringDecoder.decode(inflatedBytes.array(), offsetWords + lastWordPos, currentWordOffset - lastWordPos));
  }

  /** Decodes {@code length} bytes at {@code offset} as a translation and strips its markup. */
  private static String readXml(final ByteBuffer inflatedBytes, final SensitiveStringDecoder xmlStringDecoder, final int offset, final int length) {
    return LingoesLd2Reader.strip(new String(xmlStringDecoder.decode(inflatedBytes.array(), offset, length)));
  }

  /**
   * Prints the layout of the dictionary section, inflates its streams and extracts the result.
   *
   * @param offsetWithIndex
   *          start of the dictionary section inside {@code dataRawBytes}
   * @throws IOException
   *           if extraction fails
   */
  private static void readDictionary(final String ld2File, final ByteBuffer dataRawBytes, final int offsetWithIndex) throws IOException {
    System.out.println("词典类型：0x" + Integer.toHexString(dataRawBytes.getInt(offsetWithIndex)));
    final int limit = dataRawBytes.getInt(offsetWithIndex + 4) + offsetWithIndex + 8;
    final int offsetIndex = offsetWithIndex + 0x1C;
    final int offsetCompressedDataHeader = dataRawBytes.getInt(offsetWithIndex + 8) + offsetIndex;
    final int inflatedWordsIndexLength = dataRawBytes.getInt(offsetWithIndex + 12);
    final int inflatedWordsLength = dataRawBytes.getInt(offsetWithIndex + 16);
    final int inflatedXmlLength = dataRawBytes.getInt(offsetWithIndex + 20);
    final int definitions = (offsetCompressedDataHeader - offsetIndex) / 4;
    final List<Integer> deflateStreams = new ArrayList<>();
    dataRawBytes.position(offsetCompressedDataHeader + 8);
    int offset = dataRawBytes.getInt();
    while ((offset + dataRawBytes.position()) < limit) {
      offset = dataRawBytes.getInt();
      deflateStreams.add(Integer.valueOf(offset));
    }
    final int offsetCompressedData = dataRawBytes.position();
    System.out.println("索引词组数目：" + definitions);
    System.out.println("索引地址/大小：0x" + Integer.toHexString(offsetIndex) + " / " + (offsetCompressedDataHeader - offsetIndex) + " B");
    System.out.println("压缩数据地址/大小：0x" + Integer.toHexString(offsetCompressedData) + " / " + (limit - offsetCompressedData) + " B");
    System.out.println("词组索引地址/大小（解压缩后）：0x0 / " + inflatedWordsIndexLength + " B");
    System.out.println("词组地址/大小（解压缩后）：0x" + Integer.toHexString(inflatedWordsIndexLength) + " / " + inflatedWordsLength + " B");
    System.out.println("XML地址/大小（解压缩后）：0x" + Integer.toHexString(inflatedWordsIndexLength + inflatedWordsLength) + " / " + inflatedXmlLength + " B");
    System.out.println("文件大小（解压缩后）：" + ((inflatedWordsIndexLength + inflatedWordsLength + inflatedXmlLength) / 1024) + " KB");
    final String inflatedFile = ld2File + ".inflated";
    LingoesLd2Reader.inflate(dataRawBytes, deflateStreams, inflatedFile);

    if (new File(inflatedFile).isFile()) {
      final String indexFile = ld2File + ".idx";
      final String extractedFile = ld2File + ".words";
      final String extractedXmlFile = ld2File + ".xml";
      final String extractedOutputFile = ld2File + ".output";

      dataRawBytes.position(offsetIndex);
      final int[] idxArray = new int[definitions];
      for (int i = 0; i < definitions; i++) {
        idxArray[i] = dataRawBytes.getInt();
      }
      LingoesLd2Reader.extract(inflatedFile, indexFile, extractedFile, extractedXmlFile, extractedOutputFile, idxArray, inflatedWordsIndexLength,
          inflatedWordsIndexLength + inflatedWordsLength);
    }
  }

  /**
   * Reduces a translation to plain text. In order of precedence:
   * <ul>
   * <li>the content of the first CDATA section, or "" when the section is not terminated;</li>
   * <li>the content of the first element whose name starts with 'Ô', or "" when that element is not closed or is malformed;</li>
   * <li>otherwise all text outside of {@code <...>} tags, including text before the first and after the last tag.</li>
   * </ul>
   * Tabs, line feeds and the control characters U+001E / U+001F are replaced by spaces.
   *
   * @param xml
   *          decoded translation, not null
   * @return plain text, never null
   */
  private static String strip(final String xml) {
    final int cdataStart = xml.indexOf(LingoesLd2Reader.CDATA_OPEN);
    if (cdataStart != -1) {
      final int cdataEnd = xml.indexOf("]]>", cdataStart);
      if (cdataEnd == -1) {
        return "";
      }
      return LingoesLd2Reader.normalizeSeparators(xml.substring(cdataStart + LingoesLd2Reader.CDATA_OPEN.length(), cdataEnd));
    }

    final int elementStart = xml.indexOf("<Ô");
    if (elementStart != -1) {
      final int closingTag = xml.indexOf("</Ô", elementStart);
      if (closingTag == -1) {
        return "";
      }
      final int openingTagEnd = xml.indexOf('>', elementStart + 1);
      if ((openingTagEnd == -1) || (openingTagEnd > closingTag)) {
        // opening tag is never closed before the closing tag: nothing sensible to extract
        return "";
      }
      return LingoesLd2Reader.normalizeSeparators(xml.substring(openingTagEnd + 1, closingTag));
    }

    final StringBuilder text = new StringBuilder(xml.length());
    boolean insideTag = false;
    for (int i = 0; i < xml.length(); i++) {
      final char c = xml.charAt(i);
      if (c == '<') {
        insideTag = true;
      } else if (insideTag) {
        insideTag = c != '>';
      } else {
        text.append(c);
      }
    }
    return LingoesLd2Reader.normalizeSeparators(text.toString());
  }

  /** Replaces tabs, line feeds and the control characters U+001E / U+001F by spaces, keeping every entry on one line. */
  private static String normalizeSeparators(final String text) {
    return text.replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
  }

  /** Copies everything from {@code in} to {@code out}; neither stream is closed. */
  private static void writeInputStream(final InputStream in, final OutputStream out) throws IOException {
    final byte[] buffer = new byte[1024 * 8];
    int len;
    while ((len = in.read(buffer)) != -1) {
      out.write(buffer, 0, len);
    }
  }

  /**
   * Decoder that reports malformed or unmappable input instead of replacing it, which is what makes encoding detection by trial possible. Instances keep
   * decoder state and are not safe for concurrent use.
   */
  private static final class SensitiveStringDecoder {
    public final String          name;
    private final CharsetDecoder cd;

    SensitiveStringDecoder(final Charset cs) {
      this.cd = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
      this.name = cs.name();
    }

    /**
     * Decodes {@code len} bytes of {@code ba} starting at {@code off}.
     *
     * @return decoded characters, trimmed to their actual length
     * @throws IllegalArgumentException
     *           if the bytes are not valid in this decoder's charset
     */
    char[] decode(final byte[] ba, final int off, final int len) {
      final int en = (int) (len * (double) this.cd.maxCharsPerByte());
      final char[] ca = new char[en];
      if (len == 0) {
        return ca;
      }
      this.cd.reset();
      final ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
      final CharBuffer cb = CharBuffer.wrap(ca);
      try {
        CoderResult cr = this.cd.decode(bb, cb, true);
        if (!cr.isUnderflow()) {
          cr.throwException();
        }
        cr = this.cd.flush(cb);
        if (!cr.isUnderflow()) {
          cr.throwException();
        }
      } catch (final CharacterCodingException x) {
        // errors are reported, not substituted, so this is the normal outcome for a wrong charset
        throw new IllegalArgumentException("Input is not valid " + this.name, x);
      }
      return SensitiveStringDecoder.safeTrim(ca, cb.position());
    }

    private static char[] safeTrim(final char[] ca, final int len) {
      return (len == ca.length) ? ca : Arrays.copyOf(ca, len);
    }
  }
}

## QQPinyinQpydReader.java
/* QQ拼音qpyd词库文件解析
 * Copyright (c) 2010 Xiaoyun Zhu
 *
 *  Permission is hereby granted, free of charge, to any person obtaining a copy
 *  of this software and associated documentation files (the "Software"), to deal
 *  in the Software without restriction, including without limitation the rights
 *  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 *  copies of the Software, and to permit persons to whom the Software is
 *  furnished to do so, subject to the following conditions:
 *
 *  The above copyright notice and this permission notice shall be included in
 *  all copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 *  THE SOFTWARE.
 */

import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.zip.InflaterOutputStream;

/**
 * QQ Pinyin IME QPYD File Reader
 *
 * <pre>
 * QPYD Format overview:
 *
 * General Information:
 * - Chinese characters are all encoded with UTF-16LE.
 * - Pinyin are encoded in ascii (or UTF-8).
 * - Numbers are using little endian byte order.
 *
 * QPYD hex analysis:
 * - 0x00 QPYD file identifier
 * - 0x38 offset of compressed data (word-pinyin-dictionary)
 * - 0x44 total words in qpyd
 * - 0x60 start of header information
 *
 * Compressed data analysis:
 * - zlib stream (beginning with 0x789C) is used in (all analyzed) qpyd files
 * - data is divided in two parts
 * -- 1. offset and length information (10 bytes for each pinyin-word pair)
 *       0x00 length of pinyin (1 byte)
 *       0x01 length of word (1 byte)
 *       0x02 4 bytes that are skipped
 *       0x06 offset of the pinyin (4 bytes); the word follows the pinyin directly
 * -- 2. actual data
 *       Dictionary data has the form ((pinyin)(word))* with no separators.
 *       Data can only be read using offset and length information.
 *
 * </pre>
 *
 */
public class QQPinyinQpydReader {
  /** Size in bytes of one offset/length record in the first part of the inflated data. */
  private static final int INDEX_RECORD_SIZE = 10;

  /**
   * Prints the header and every word/pinyin pair of a QPYD file. As a debugging aid the inflated dictionary is also written to "&lt;file&gt;.unzipped".
   *
   * @param args
   *          args[0] is the path of the QPYD file
   * @throws IOException
   *           if the file cannot be read, is not valid zlib data, or the debug file cannot be written
   */
  public static void main(final String[] args) throws IOException {
    if (args.length < 1) {
      System.err.println("Usage: java QQPinyinQpydReader <file.qpyd>");
      return;
    }
    // download from http://dict.py.qq.com/list.php
    final String qqydFile = args[0];

    // read qpyd into byte array
    final ByteArrayOutputStream dataOut = new ByteArrayOutputStream();

    try (RandomAccessFile file = new RandomAccessFile(qqydFile, "r"); FileChannel fChannel = file.getChannel()) {
      fChannel.transferTo(0, fChannel.size(), Channels.newChannel(dataOut));
    }

    // qpyd as bytes
    final ByteBuffer dataRawBytes = ByteBuffer.wrap(dataOut.toByteArray());
    dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);

    System.out.println("文件: " + qqydFile);

    // read info of compressed data
    final int startZippedDictAddr = dataRawBytes.getInt(0x38);
    final int zippedDictLength = dataRawBytes.limit() - startZippedDictAddr;

    // header between 0x60 and the compressed data as UTF-16LE string
    final String dataString = new String(Arrays.copyOfRange(dataRawBytes.array(), 0x60, startZippedDictAddr), "UTF-16LE");

    // print header
    System.out.println("名称：" + QQPinyinQpydReader.substringBetween(dataString, "Name: ", "\r\n"));
    System.out.println("类型：" + QQPinyinQpydReader.substringBetween(dataString, "Type: ", "\r\n"));
    System.out.println("子类型：" + QQPinyinQpydReader.substringBetween(dataString, "FirstType: ", "\r\n"));
    System.out.println("词库说明：" + QQPinyinQpydReader.substringBetween(dataString, "Intro: ", "\r\n"));
    System.out.println("词库样例：" + QQPinyinQpydReader.substringBetween(dataString, "Example: ", "\r\n"));
    System.out.println("词条数：" + dataRawBytes.getInt(0x44));

    // inflate the zipped dictionary into the (reused) byte array stream
    dataOut.reset();
    try (InflaterOutputStream inflater = new InflaterOutputStream(dataOut)) {
      inflater.write(dataRawBytes.array(), startZippedDictAddr, zippedDictLength);
    }

    // uncompressed qqyd dictionary as bytes
    final byte[] byteArray = dataOut.toByteArray();
    final ByteBuffer dataUnzippedBytes = ByteBuffer.wrap(byteArray);
    dataUnzippedBytes.order(ByteOrder.LITTLE_ENDIAN);

    // for debugging: save unzipped data to *.unzipped file
    try (FileOutputStream out = new FileOutputStream(qqydFile + ".unzipped")) {
      out.write(byteArray);
      System.out.println("压缩数据：0x" + Integer.toHexString(startZippedDictAddr) + " (解压前：" + zippedDictLength + " B, 解压后：" + dataUnzippedBytes.limit() + " B)");
    }

    // start address of the actual dictionary data; known once the first record has been read, because
    // the first record points at the first pinyin, which directly follows the last record
    int unzippedDictStartAddr = -1;
    while ((dataUnzippedBytes.remaining() >= QQPinyinQpydReader.INDEX_RECORD_SIZE)
        && ((unzippedDictStartAddr == -1) || (dataUnzippedBytes.position() < unzippedDictStartAddr))) {
      // read word
      final int pinyinLength = dataUnzippedBytes.get() & 0xff;
      final int wordLength = dataUnzippedBytes.get() & 0xff;
      dataUnzippedBytes.getInt(); // garbage
      final int pinyinStartAddr = dataUnzippedBytes.getInt();
      final int wordStartAddr = pinyinStartAddr + pinyinLength;

      if (unzippedDictStartAddr == -1) {
        unzippedDictStartAddr = pinyinStartAddr;
        System.out.println("词库地址（解压后）：0x" + Integer.toHexString(unzippedDictStartAddr) + "\n");
      }

      final String pinyin = new String(Arrays.copyOfRange(byteArray, pinyinStartAddr, pinyinStartAddr + pinyinLength), "UTF-8");
      final String word = new String(Arrays.copyOfRange(byteArray, wordStartAddr, wordStartAddr + wordLength), "UTF-16LE");
      System.out.println(word + "\t" + pinyin);
    }
  }

  /**
   * Returns the text between the first occurrence of {@code start} and the next occurrence of {@code end} after it.
   *
   * @param text
   *          text to search, not null
   * @param start
   *          marker in front of the wanted text
   * @param end
   *          marker behind the wanted text; only occurrences after the complete start marker count
   * @return the enclosed text (possibly empty), or null if either marker is missing
   */
  public static final String substringBetween(final String text, final String start, final String end) {
    final int nStart = text.indexOf(start);
    if (nStart == -1) {
      return null;
    }
    // search behind the whole start marker, so an end marker that also occurs inside it cannot yield an inverted range
    final int contentStart = nStart + start.length();
    final int nEnd = text.indexOf(end, contentStart);
    if (nEnd == -1) {
      return null;
    }
    return text.substring(contentStart, nEnd);
  }
}

## SogouScelReader.java
/* 搜狗拼音输入法SCEL词库文件解析
 * Copyright (c) 2010 Xiaoyun Zhu
 *
 *  Permission is hereby granted, free of charge, to any person obtaining a copy
 *  of this software and associated documentation files (the "Software"), to deal
 *  in the Software without restriction, including without limitation the rights
 *  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 *  copies of the Software, and to permit persons to whom the Software is
 *  furnished to do so, subject to the following conditions:
 *
 *  The above copyright notice and this permission notice shall be included in
 *  all copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 *  THE SOFTWARE.
 */

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;

/**
 * Sogou Pinyin IME SCEL File Reader
 *
 * <pre>
 * SCEL Format overview:
 *
 * General Information:
 * - Chinese characters and pinyin are all encoded with UTF-16LE.
 * - Numbers are using little endian byte order.
 *
 * SCEL hex analysis:
 * - 0x0           Pinyin List Offset
 * - 0x120         total number of words
 * - 0x<PY-Offset> total number of pinyin
 * - ...           List of pinyin as [index, byte length of pinyin, pinyin as string] triples
 * - ...           Dictionary
 * - ...           <additional garbage>
 *
 * Dictionary format:
 * - It can be interpreted as a list of
 *   [alternatives of words,
 *       byte length of pinyin indexes, pinyin indexes,
 *       [byte length of word, word as string, length of skip bytes, skip bytes]
 *       ... (alternatives)
 *   ].
 *
 * </pre>
 *
 */
class SogouScelReader {
	/**
	 * Reads the SCEL file named by {@code args[0]} and prints every dictionary entry as
	 * {@code word<TAB>pinyin} (alternative words joined by {@code ", "}, syllables joined
	 * by {@code '}), followed by the number of entries read.
	 *
	 * @param args command line arguments; {@code args[0]} is the path of the SCEL file
	 * @throws IOException if the file cannot be read
	 */
	public static void main(final String[] args) throws IOException {

		// download from http://pinyin.sogou.com/dict
		final String scelFile = args[0];

		// read scel into byte array
		final ByteArrayOutputStream dataOut = new ByteArrayOutputStream();

		try (RandomAccessFile file = new RandomAccessFile(scelFile, "r"); final FileChannel fChannel = file.getChannel();) {
			fChannel.transferTo(0, fChannel.size(), Channels.newChannel(dataOut));
		}

		// scel as bytes
		final ByteBuffer dataRawBytes = ByteBuffer.wrap(dataOut.toByteArray());
		dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);

		System.out.println("文件: " + scelFile);

		final int totalWords = dataRawBytes.getInt(0x120);

		// the int at offset 0 is the offset of the pinyin list
		dataRawBytes.position(dataRawBytes.getInt());
		final int totalPinyin = dataRawBytes.getInt();

		// pinyin table, grown on demand so that any unsigned 16-bit index fits
		String[] pyDict = new String[512];
		for (int i = 0; i < totalPinyin; i++) {
			// all 16-bit fields are unsigned; mask to avoid sign extension
			final int idx = dataRawBytes.getShort() & 0xffff;
			final int len = dataRawBytes.getShort() & 0xffff;
			if (idx >= pyDict.length) {
				pyDict = java.util.Arrays.copyOf(pyDict, Math.max(idx + 1, pyDict.length * 2));
			}
			pyDict[idx] = SogouScelReader.readUtf16(dataRawBytes, len);
		}

		// extract dictionary
		int counter = 0;
		for (int i = 0; i < totalWords; i++) {
			final StringBuilder py = new StringBuilder();
			final StringBuilder word = new StringBuilder();

			final int alternatives = dataRawBytes.getShort() & 0xffff;
			// the stored value is the byte length of the index list; each index is 2 bytes
			final int pyCount = (dataRawBytes.getShort() & 0xffff) / 2;
			for (int p = 0; p < pyCount; p++) {
				final int key = dataRawBytes.getShort() & 0xffff;
				if (p > 0) {
					py.append('\'');
				}
				// an index without a table entry is printed as "null", in range or not
				py.append(key < pyDict.length ? pyDict[key] : null);
			}

			for (int a = 0; a < alternatives; a++) {
				if (a > 0) {
					word.append(", ");
				}
				final int wordLength = dataRawBytes.getShort() & 0xffff;
				word.append(SogouScelReader.readUtf16(dataRawBytes, wordLength));
				// skip bytes: advance the position instead of copying them into a
				// fixed-size scratch buffer, so any skip length is accepted
				final int skip = dataRawBytes.getShort() & 0xffff;
				dataRawBytes.position(dataRawBytes.position() + skip);
			}
			System.out.println(word.toString() + "\t" + py.toString());
			counter++;
		}
		System.out.println("\n读出词汇'" + scelFile + "': " + counter);
	}

	/**
	 * Reads {@code byteLength} bytes from the current position of {@code data} and decodes
	 * them as UTF-16LE.
	 *
	 * @param data       buffer positioned at the first byte of the string
	 * @param byteLength number of bytes (not characters) to consume
	 * @return the decoded string
	 * @throws IOException if the UTF-16LE charset is unavailable
	 */
	private static String readUtf16(final ByteBuffer data, final int byteLength) throws IOException {
		final byte[] bytes = new byte[byteLength];
		data.get(bytes);
		return new String(bytes, "UTF-16LE");
	}
}
	/* 百度拼音输入法BCD词库文件解析
	* Copyright (c) 2010 Xiaoyun Zhu
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*/
	import java.io.IOException;
	import java.io.RandomAccessFile;
	import java.nio.ByteBuffer;
	import java.nio.ByteOrder;
	import java.nio.channels.FileChannel;

	import cn.kk.kkdict.utils.Helper;

	/**
	* Baidu Pinyin IME BDICT File Reader
	*
	* <pre>
	* BDICT Format overview:
	*
	* General Information:
	* - Chinese characters and pinyin are all encoded with UTF-16LE.
	* - Numbers are using little endian byte order.
	*
	* BDICT hex analysis:
	* - 0x250 total number of words
	* - 0x350 dictionary offset
	* - 0x<Offset> Dictionary
	*
	* Dictionary format:
	* - It can be interpreted as a list of
	* [amount of characters (short not integer!)
	* pinyin construction using fenmu and yunmu,
	* word as string
	* ].
	*
	* </pre>
	*
	* @author keke
	*/
	public class BaiduBcdReader {
		/** Syllable initials, indexed by the first byte of each syllable; index 12 is the empty initial. */
		private static final String[] FEN_MU = { "c", "d", "b", "f", "g", "h", "ch", "j", "k", "l", "m", "n", "", "p", "q", "r", "s", "t", "sh", "zh", "w", "x", "y",
				"z" };
		/**
		 * Syllable finals, indexed by the second byte of each syllable. The table is ordered by
		 * length (4, 3, 2, 1 letters); index 2 is "iong" and index 30 is "o" (both were
		 * previously duplicates of "ong" and "a", leaving "iong" and "o" unrepresentable).
		 */
		private static final String[] YUN_MU = { "uang", "iang", "iong", "ang", "eng", "ian", "iao", "ing", "ong", "uai", "uan", "ai", "an", "ao", "ei", "en", "er",
				"ua", "ie", "in", "iu", "ou", "ia", "ue", "ui", "un", "uo", "a", "e", "i", "o", "u", "v" };

		/**
		 * Prints every entry of the dictionary file named by {@code args[0]} as {@code word<TAB>pinyin}.
		 *
		 * @param args command line arguments; {@code args[0]} is the path of the dictionary file
		 */
		public static void main(final String[] args) {
			// download from http://r6.mo.baidu.com/web/iw/index/
			final String bdictFile = args[0];

			BaiduBcdReader.analyze(bdictFile);
		}

		/**
		 * Loads the whole file into memory and prints its entries to standard output.
		 * I/O failures are reported on standard error instead of being propagated.
		 *
		 * @param bdictFile path of the dictionary file
		 */
		private static void analyze(final String bdictFile) {
			// try-with-resources closes the file and its channel on every path
			try (RandomAccessFile file = new RandomAccessFile(bdictFile, "r"); FileChannel fChannel = file.getChannel()) {
				// read bdict into byte array
				final ByteBuffer dataRawBytes = ByteBuffer.allocate((int) fChannel.size());
				while (dataRawBytes.hasRemaining() && (fChannel.read(dataRawBytes) != -1)) {
					// keep reading: a single read() is not guaranteed to fill the buffer
				}
				dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);
				dataRawBytes.rewind();

				System.out.println("文件: " + bdictFile);

				final int total = dataRawBytes.getInt(0x250);
				// dictionary offset
				dataRawBytes.position(0x350);
				for (int i = 0; i < total; i++) {
					// number of characters, stored as an unsigned 16-bit value
					final int length = dataRawBytes.getShort() & 0xffff;
					dataRawBytes.getShort(); // second 16-bit field of the entry header is not used
					final StringBuilder pinyin = new StringBuilder();
					for (int j = 0; j < length; j++) {
						if (j > 0) {
							pinyin.append('\'');
						}
						// table indexes are unsigned bytes
						final int initialIdx = dataRawBytes.get() & 0xff;
						final int finalIdx = dataRawBytes.get() & 0xff;
						pinyin.append(BaiduBcdReader.FEN_MU[initialIdx]).append(BaiduBcdReader.YUN_MU[finalIdx]);
					}
					// two bytes per UTF-16LE character; sized per word so long words fit
					final byte[] wordBytes = new byte[2 * length];
					dataRawBytes.get(wordBytes);
					final String word = new String(wordBytes, "UTF-16LE");
					System.out.println(word + "\t" + pinyin);
				}

				System.out.println("\nExtracted '" + bdictFile + "': " + total);
			} catch (IOException e) {
				System.err.println("Error: " + e);
			}
		}
	}
	/* Lingoes灵格斯电子词典LD2(LDF)文件解析
	* Copyright (c) 2010 Xiaoyun Zhu
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*/

	import java.io.ByteArrayInputStream;
	import java.io.File;
	import java.io.FileNotFoundException;
	import java.io.FileOutputStream;
	import java.io.FileWriter;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.OutputStream;
	import java.io.RandomAccessFile;
	import java.io.UnsupportedEncodingException;
	import java.nio.ByteBuffer;
	import java.nio.ByteOrder;
	import java.nio.CharBuffer;
	import java.nio.channels.FileChannel;
	import java.nio.charset.CharacterCodingException;
	import java.nio.charset.Charset;
	import java.nio.charset.CharsetDecoder;
	import java.nio.charset.CoderResult;
	import java.nio.charset.CodingErrorAction;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.List;
	import java.util.zip.Inflater;
	import java.util.zip.InflaterInputStream;

	/**
	* Lingoes LD2/LDF File Reader
	*
	* <pre>
	* Lingoes Format overview:
	*
	* General Information:
	* - Dictionary data are stored in deflate streams.
	* - Index group information is stored in an index array in the LD2 file itself.
	* - Numbers are using little endian byte order.
	* - Definitions and xml data have UTF-8 or UTF-16LE encodings.
	*
	* LD2 file schema:
	* - File Header
	* - File Description
	* - Additional Information (optional)
	* - Index Group (corresponds to definitions in dictionary)
	* - Deflated Dictionary Streams
	* -- Index Data
	* --- Offsets of definitions
	* --- Offsets of translations
	* --- Flags
	* --- References to other translations
	* -- Definitions
	* -- Translations (xml)
	*
	* TODO: find encoding / language fields to replace auto-detect of encodings
	*
	* </pre>
	*
	* @author keke
	*
	*/
	public class LingoesLd2Reader {
		/** Charset of all generated text files; fixed so the output does not depend on the platform default. */
		private static final Charset OUTPUT_CHARSET = Charset.forName("UTF-8");

		/** Encodings tried in order by {@link #detectEncodings}; index 1 (UTF-16LE) is the fallback. */
		private static final SensitiveStringDecoder[] AVAIL_ENCODINGS = { new SensitiveStringDecoder(Charset.forName("UTF-8")),
				new SensitiveStringDecoder(Charset.forName("UTF-16LE")), new SensitiveStringDecoder(Charset.forName("UTF-16BE")),
				new SensitiveStringDecoder(Charset.forName("EUC-JP")) };

		/**
		 * Reads the LD2/LDF file named by {@code args[0]}, prints its header fields and, when the
		 * file contains dictionary data, extracts it into sibling files ({@code .inflated},
		 * {@code .idx}, {@code .words}, {@code .xml}, {@code .output}).
		 *
		 * @param args command line arguments; {@code args[0]} is the path of the dictionary file
		 * @throws IOException if the file cannot be read or an output file cannot be written
		 */
		public static void main(final String[] args) throws IOException {
			// download from
			// https://skydrive.live.com/?cid=a10100d37adc7ad3&sc=documents&id=A10100D37ADC7AD3%211172#cid=A10100D37ADC7AD3&sc=documents
			final String ld2File = args[0];

			// read lingoes ld2 into byte array
			final ByteBuffer dataRawBytes = LingoesLd2Reader.readFully(ld2File);

			System.out.println("文件：" + ld2File);
			System.out.println("类型：" + new String(dataRawBytes.array(), 0, 4, "ASCII"));
			System.out.println("版本：" + dataRawBytes.getShort(0x18) + "." + dataRawBytes.getShort(0x1A));
			System.out.println("ID: 0x" + Long.toHexString(dataRawBytes.getLong(0x1C)));

			final int offsetData = dataRawBytes.getInt(0x5C) + 0x60;
			if (dataRawBytes.limit() > offsetData) {
				System.out.println("简介地址：0x" + Integer.toHexString(offsetData));
				final int type = dataRawBytes.getInt(offsetData);
				System.out.println("简介类型：0x" + Integer.toHexString(type));
				final int offsetWithInfo = dataRawBytes.getInt(offsetData + 4) + offsetData + 12;
				if (type == 3) {
					// without additional information
					LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetData);
				} else if (dataRawBytes.limit() > (offsetWithInfo - 0x1C)) {
					LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetWithInfo);
				} else {
					System.err.println("文件不包含字典数据。网上字典？");
				}
			} else {
				System.err.println("文件不包含字典数据。网上字典？");
			}
		}

		/**
		 * Loads a whole file into a heap buffer.
		 *
		 * @param path file to read
		 * @return a little-endian buffer positioned at 0 whose capacity is the file size
		 * @throws IOException if the file cannot be opened or read
		 */
		private static ByteBuffer readFully(final String path) throws IOException {
			try (RandomAccessFile file = new RandomAccessFile(path, "r"); FileChannel channel = file.getChannel()) {
				final ByteBuffer data = ByteBuffer.allocate((int) channel.size());
				while (data.hasRemaining() && (channel.read(data) != -1)) {
					// keep reading: a single read() is not guaranteed to fill the buffer
				}
				data.order(ByteOrder.LITTLE_ENDIAN);
				data.rewind();
				return data;
			}
		}

		/**
		 * Opens a text file for writing in {@link #OUTPUT_CHARSET}.
		 *
		 * @param path file to create or truncate
		 * @return a writer the caller must close
		 * @throws FileNotFoundException if the file cannot be opened for writing
		 */
		private static java.io.Writer newUtf8Writer(final String path) throws FileNotFoundException {
			return new java.io.OutputStreamWriter(new FileOutputStream(path), LingoesLd2Reader.OUTPUT_CHARSET);
		}

		/**
		 * Inflates one deflate stream taken from {@code data} into {@code inflatedFile}.
		 *
		 * @param inflatedFile target file
		 * @param data         buffer whose backing array holds the compressed stream
		 * @param offset       start of the stream inside the backing array
		 * @param length       number of compressed bytes available
		 * @param append       {@code true} to append to the target file, {@code false} to overwrite it
		 * @return number of compressed bytes consumed by the inflater
		 * @throws IOException if the stream is corrupt or the target file cannot be written
		 */
		private static final long decompress(final String inflatedFile, final ByteBuffer data, final int offset, final int length, final boolean append)
				throws IOException {
			final Inflater inflator = new Inflater();
			try (final InflaterInputStream in = new InflaterInputStream(new ByteArrayInputStream(data.array(), offset, length), inflator, 1024 * 8);
					final FileOutputStream out = new FileOutputStream(inflatedFile, append);) {
				LingoesLd2Reader.writeInputStream(in, out);
				return inflator.getBytesRead();
			} finally {
				// an explicitly supplied Inflater is not released by the stream; end it even on failure
				inflator.end();
			}
		}

		/**
		 * Finds the first (word encoding, xml encoding) pair from {@link #AVAIL_ENCODINGS} that
		 * decodes the first definitions (at most ten) without a runtime failure.
		 *
		 * @return a two element array {word decoder, xml decoder}; UTF-16LE for both if no pair works
		 */
		private static final SensitiveStringDecoder[] detectEncodings(final ByteBuffer inflatedBytes, final int offsetWords, final int offsetXml, final int defTotal,
				final int dataLen, final int[] idxData, final String[] defData) {
			final int test = Math.min(defTotal, 10);
			for (int j = 0; j < LingoesLd2Reader.AVAIL_ENCODINGS.length; j++) {
				for (int k = 0; k < LingoesLd2Reader.AVAIL_ENCODINGS.length; k++) {
					try {
						for (int i = 0; i < test; i++) {
							LingoesLd2Reader.readDefinitionData(inflatedBytes, offsetWords, offsetXml, dataLen, LingoesLd2Reader.AVAIL_ENCODINGS[j],
									LingoesLd2Reader.AVAIL_ENCODINGS[k], idxData, defData, i);
						}
						System.out.println("词组编码：" + LingoesLd2Reader.AVAIL_ENCODINGS[j].name);
						System.out.println("XML编码：" + LingoesLd2Reader.AVAIL_ENCODINGS[k].name);
						return new SensitiveStringDecoder[] { LingoesLd2Reader.AVAIL_ENCODINGS[j], LingoesLd2Reader.AVAIL_ENCODINGS[k] };
					} catch (final RuntimeException ignored) {
						// expected while probing: this pair cannot decode the sample, try the next one
					}
				}
			}
			System.err.println("自动识别编码失败！选择UTF-16LE继续。");
			return new SensitiveStringDecoder[] { LingoesLd2Reader.AVAIL_ENCODINGS[1], LingoesLd2Reader.AVAIL_ENCODINGS[1] };
		}

		/**
		 * Decodes every definition of the inflated file and writes the word list, the xml list,
		 * the combined {@code word=xml} list and the index list. All output files are UTF-8.
		 *
		 * @param inflatedFile        file produced by {@link #inflate}
		 * @param indexFile           receives one {@code word, index} line per entry of {@code idxArray}
		 * @param extractedWordsFile  receives one word per line
		 * @param extractedXmlFile    receives one stripped translation per line
		 * @param extractedOutputFile receives one {@code word=translation} line per definition
		 * @param idxArray            definition indexes read from the index group of the LD2 file
		 * @param offsetDefs          offset of the word data inside the inflated file
		 * @param offsetXml           offset of the xml data inside the inflated file
		 * @throws IOException if the inflated file cannot be read or an output file cannot be written
		 */
		private static final void extract(final String inflatedFile, final String indexFile, final String extractedWordsFile, final String extractedXmlFile,
				final String extractedOutputFile, final int[] idxArray, final int offsetDefs, final int offsetXml) throws IOException, FileNotFoundException,
				UnsupportedEncodingException {
			System.out.println("写入'" + extractedOutputFile + "'。。。");

			// read inflated data
			final ByteBuffer dataRawBytes = LingoesLd2Reader.readFully(inflatedFile);

			int counter = 0;
			try (final java.io.Writer indexWriter = LingoesLd2Reader.newUtf8Writer(indexFile);
					final java.io.Writer defsWriter = LingoesLd2Reader.newUtf8Writer(extractedWordsFile);
					final java.io.Writer xmlWriter = LingoesLd2Reader.newUtf8Writer(extractedXmlFile);
					final java.io.Writer outputWriter = LingoesLd2Reader.newUtf8Writer(extractedOutputFile);) {
				// index records are spaced dataLen bytes apart
				final int dataLen = 10;
				final int defTotal = (offsetDefs / dataLen) - 1;

				final String[] words = new String[defTotal];
				final int[] idxData = new int[6];
				final String[] defData = new String[2];

				final SensitiveStringDecoder[] encodings = LingoesLd2Reader.detectEncodings(dataRawBytes, offsetDefs, offsetXml, defTotal, dataLen, idxData, defData);

				for (int i = 0; i < defTotal; i++) {
					LingoesLd2Reader.readDefinitionData(dataRawBytes, offsetDefs, offsetXml, dataLen, encodings[0], encodings[1], idxData, defData, i);

					words[i] = defData[0];
					defsWriter.write(defData[0]);
					defsWriter.write("\n");

					xmlWriter.write(defData[1]);
					xmlWriter.write("\n");

					outputWriter.write(defData[0]);
					outputWriter.write("=");
					outputWriter.write(defData[1]);
					outputWriter.write("\n");

					System.out.println(defData[0] + " = " + defData[1]);
					counter++;
				}

				for (int i = 0; i < idxArray.length; i++) {
					final int idx = idxArray[i];
					indexWriter.write(words[idx]);
					indexWriter.write(", ");
					indexWriter.write(String.valueOf(idx));
					indexWriter.write("\n");
				}
			}
			System.out.println("成功读出" + counter + "组数据。");
		}

		/**
		 * Reads six values starting at {@code position}: two ints, two unsigned bytes and two more
		 * ints (18 bytes in total). Callers space records 10 bytes apart, so the last two ints
		 * overlap the first two ints of the following record.
		 *
		 * @param dataRawBytes buffer to read from; its position is changed
		 * @param position     absolute offset of the record
		 * @param wordIdxData  receives the six values; must have at least six elements
		 */
		private static final void getIdxData(final ByteBuffer dataRawBytes, final int position, final int[] wordIdxData) {
			dataRawBytes.position(position);
			wordIdxData[0] = dataRawBytes.getInt();
			wordIdxData[1] = dataRawBytes.getInt();
			wordIdxData[2] = dataRawBytes.get() & 0xff;
			wordIdxData[3] = dataRawBytes.get() & 0xff;
			wordIdxData[4] = dataRawBytes.getInt();
			wordIdxData[5] = dataRawBytes.getInt();
		}

		/**
		 * Inflates all deflate streams that follow the current buffer position into one file.
		 * Failures are reported on standard error and end the inflation; they are not propagated.
		 *
		 * @param dataRawBytes   LD2 buffer positioned at the start of the compressed data
		 * @param deflateStreams end offset of every stream, relative to the current position
		 * @param inflatedFile   file receiving the concatenated inflated data
		 */
		private static final void inflate(final ByteBuffer dataRawBytes, final List<Integer> deflateStreams, final String inflatedFile) {
			System.out.println("解压缩'" + deflateStreams.size() + "'个数据流至'" + inflatedFile + "'。。。");
			final int startOffset = dataRawBytes.position();
			int offset = -1;
			int lastOffset = startOffset;
			boolean append = false;
			try {
				for (final Integer offsetRelative : deflateStreams) {
					offset = startOffset + offsetRelative.intValue();
					LingoesLd2Reader.decompress(inflatedFile, dataRawBytes, lastOffset, offset - lastOffset, append);
					// the first stream overwrites the file, every later one is appended
					append = true;
					lastOffset = offset;
				}
			} catch (final IOException | RuntimeException e) {
				System.err.println("解压缩失败: 0x" + Integer.toHexString(offset) + ": " + e.toString());
			}
		}

		/**
		 * Decodes definition number {@code i}: the word goes to {@code defData[0]}, the stripped
		 * translation (joined with the translations of all referenced definitions) to {@code defData[1]}.
		 *
		 * @param inflatedBytes     inflated dictionary data
		 * @param offsetWords       offset of the word data
		 * @param offsetXml         offset of the xml data
		 * @param dataLen           distance in bytes between two index records
		 * @param wordStringDecoder decoder for words
		 * @param xmlStringDecoder  decoder for translations
		 * @param idxData           scratch array with at least six elements
		 * @param defData           receives {word, translation}
		 * @param i                 definition number
		 */
		private static final void readDefinitionData(final ByteBuffer inflatedBytes, final int offsetWords, final int offsetXml, final int dataLen,
				final SensitiveStringDecoder wordStringDecoder, final SensitiveStringDecoder xmlStringDecoder, final int[] idxData, final String[] defData, final int i) {
			LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * i, idxData);
			int lastWordPos = idxData[0];
			int lastXmlPos = idxData[1];
			// idxData[2] holds flags, which are not evaluated
			int refs = idxData[3];
			final int currentWordOffset = idxData[4];
			int currentXmlOffset = idxData[5];

			String xml = LingoesLd2Reader.decodeXml(inflatedBytes, xmlStringDecoder, offsetXml, lastXmlPos, currentXmlOffset);
			while (refs-- > 0) {
				// each reference is a 4-byte definition number stored in front of the word
				final int ref = inflatedBytes.getInt(offsetWords + lastWordPos);
				LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * ref, idxData);
				lastXmlPos = idxData[1];
				currentXmlOffset = idxData[5];
				final String refXml = LingoesLd2Reader.decodeXml(inflatedBytes, xmlStringDecoder, offsetXml, lastXmlPos, currentXmlOffset);
				xml = xml.isEmpty() ? refXml : (refXml + ", " + xml);
				lastWordPos += 4;
			}
			defData[1] = xml;

			final String word = new String(wordStringDecoder.decode(inflatedBytes.array(), offsetWords + lastWordPos, currentWordOffset - lastWordPos));
			defData[0] = word;
		}

		/** Decodes the xml bytes between the relative offsets {@code from} and {@code to} and strips the markup. */
		private static String decodeXml(final ByteBuffer inflatedBytes, final SensitiveStringDecoder decoder, final int offsetXml, final int from, final int to) {
			return LingoesLd2Reader.strip(new String(decoder.decode(inflatedBytes.array(), offsetXml + from, to - from)));
		}

		/**
		 * Prints the layout of the dictionary section, inflates its streams into
		 * {@code ld2File + ".inflated"} and, if that file exists afterwards, extracts the definitions.
		 *
		 * @param ld2File         path of the LD2 file; used as prefix of all output file names
		 * @param dataRawBytes    complete LD2 file content
		 * @param offsetWithIndex offset of the dictionary section
		 * @throws IOException if an output file cannot be written
		 */
		private static final void readDictionary(final String ld2File, final ByteBuffer dataRawBytes, final int offsetWithIndex) throws IOException,
				FileNotFoundException, UnsupportedEncodingException {
			System.out.println("词典类型：0x" + Integer.toHexString(dataRawBytes.getInt(offsetWithIndex)));
			final int limit = dataRawBytes.getInt(offsetWithIndex + 4) + offsetWithIndex + 8;
			final int offsetIndex = offsetWithIndex + 0x1C;
			final int offsetCompressedDataHeader = dataRawBytes.getInt(offsetWithIndex + 8) + offsetIndex;
			final int inflatedWordsIndexLength = dataRawBytes.getInt(offsetWithIndex + 12);
			final int inflatedWordsLength = dataRawBytes.getInt(offsetWithIndex + 16);
			final int inflatedXmlLength = dataRawBytes.getInt(offsetWithIndex + 20);
			final int definitions = (offsetCompressedDataHeader - offsetIndex) / 4;
			final List<Integer> deflateStreams = new ArrayList<>();
			dataRawBytes.position(offsetCompressedDataHeader + 8);
			int offset = dataRawBytes.getInt();
			while ((offset + dataRawBytes.position()) < limit) {
				offset = dataRawBytes.getInt();
				deflateStreams.add(Integer.valueOf(offset));
			}
			final int offsetCompressedData = dataRawBytes.position();
			System.out.println("索引词组数目：" + definitions);
			System.out.println("索引地址/大小：0x" + Integer.toHexString(offsetIndex) + " / " + (offsetCompressedDataHeader - offsetIndex) + " B");
			System.out.println("压缩数据地址/大小：0x" + Integer.toHexString(offsetCompressedData) + " / " + (limit - offsetCompressedData) + " B");
			System.out.println("词组索引地址/大小（解压缩后）：0x0 / " + inflatedWordsIndexLength + " B");
			System.out.println("词组地址/大小（解压缩后）：0x" + Integer.toHexString(inflatedWordsIndexLength) + " / " + inflatedWordsLength + " B");
			System.out.println("XML地址/大小（解压缩后）：0x" + Integer.toHexString(inflatedWordsIndexLength + inflatedWordsLength) + " / " + inflatedXmlLength + " B");
			System.out.println("文件大小（解压缩后）：" + ((inflatedWordsIndexLength + inflatedWordsLength + inflatedXmlLength) / 1024) + " KB");
			final String inflatedFile = ld2File + ".inflated";
			LingoesLd2Reader.inflate(dataRawBytes, deflateStreams, inflatedFile);

			if (new File(inflatedFile).isFile()) {
				final String indexFile = ld2File + ".idx";
				final String extractedFile = ld2File + ".words";
				final String extractedXmlFile = ld2File + ".xml";
				final String extractedOutputFile = ld2File + ".output";

				dataRawBytes.position(offsetIndex);
				final int[] idxArray = new int[definitions];
				for (int i = 0; i < definitions; i++) {
					idxArray[i] = dataRawBytes.getInt();
				}
				LingoesLd2Reader.extract(inflatedFile, indexFile, extractedFile, extractedXmlFile, extractedOutputFile, idxArray, inflatedWordsIndexLength,
						inflatedWordsIndexLength + inflatedWordsLength);
			}
		}

		/**
		 * Extracts the text of a translation: the content of the first CDATA section if there is
		 * one, else the content of the first {@code <Ô...>} element, else the text found between
		 * a closing {@code >} and the next {@code <}. Tabs, line feeds and the control characters
		 * U+001E/U+001F are replaced by spaces.
		 *
		 * @param xml decoded translation
		 * @return the extracted text; empty if a CDATA section or {@code <Ô} element is not
		 *         terminated or if no text between tags is found
		 */
		private static final String strip(final String xml) {
			int open = 0;
			int end = 0;
			if ((open = xml.indexOf("<![CDATA[")) != -1) {
				if ((end = xml.indexOf("]]>", open)) != -1) {
					return LingoesLd2Reader.blankControlChars(xml.substring(open + "<![CDATA[".length(), end));
				}
			} else if ((open = xml.indexOf("<Ô")) != -1) {
				if ((end = xml.indexOf("</Ô", open)) != -1) {
					open = xml.indexOf(">", open + 1);
					return LingoesLd2Reader.blankControlChars(xml.substring(open + 1, end));
				}
			} else {
				final StringBuilder sb = new StringBuilder();
				end = 0;
				open = xml.indexOf('<');
				do {
					// copy the text between the last '>' and the current '<', if there is any
					if ((open - end) > 1) {
						sb.append(xml.substring(end + 1, open));
					}
					open = xml.indexOf('<', open + 1);
					end = xml.indexOf('>', end + 1);
				} while ((open != -1) && (end != -1));
				return LingoesLd2Reader.blankControlChars(sb.toString());
			}
			return "";
		}

		/** Replaces tab, line feed, U+001E and U+001F by a space each. */
		private static String blankControlChars(final String text) {
			return text.replace('\t', ' ').replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
		}

		/**
		 * Copies {@code in} to {@code out} until a read returns no data. Neither stream is closed.
		 *
		 * @throws IOException if reading or writing fails
		 */
		private static final void writeInputStream(final InputStream in, final OutputStream out) throws IOException {
			final byte[] buffer = new byte[1024 * 8];
			int len;
			while ((len = in.read(buffer)) > 0) {
				out.write(buffer, 0, len);
			}
		}

		/**
		 * Strict decoder: malformed or unmappable input is reported as an exception instead of
		 * being replaced, which is what makes encoding detection by trial possible.
		 */
		private static class SensitiveStringDecoder {
			public final String name;
			private final CharsetDecoder cd;

			SensitiveStringDecoder(final Charset cs) {
				this.cd = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
				this.name = cs.name();
			}

			/**
			 * Decodes {@code len} bytes of {@code ba} starting at {@code off}.
			 *
			 * @return the decoded characters, trimmed to the decoded length
			 * @throws IllegalArgumentException if the bytes are not valid in this decoder's charset
			 */
			char[] decode(final byte[] ba, final int off, final int len) {
				final int en = (int) (len * (double) this.cd.maxCharsPerByte());
				final char[] ca = new char[en];
				if (len == 0) {
					return ca;
				}
				this.cd.reset();
				final ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
				final CharBuffer cb = CharBuffer.wrap(ca);
				try {
					CoderResult cr = this.cd.decode(bb, cb, true);
					if (!cr.isUnderflow()) {
						cr.throwException();
					}
					cr = this.cd.flush(cb);
					if (!cr.isUnderflow()) {
						cr.throwException();
					}
				} catch (final CharacterCodingException x) {
					// the decoder is configured with REPORT, so this is the normal outcome for
					// bytes in a different encoding; surface it with the cause attached
					throw new IllegalArgumentException("input is not valid " + this.name, x);
				}
				return SensitiveStringDecoder.safeTrim(ca, cb.position());
			}

			/** Returns {@code ca} itself if it is completely filled, else a copy of its first {@code len} chars. */
			private static char[] safeTrim(final char[] ca, final int len) {
				if (len == ca.length) {
					return ca;
				} else {
					return Arrays.copyOf(ca, len);
				}
			}
		}
	}
	/* QQ拼音qpyd词库文件解析
	* Copyright (c) 2010 Xiaoyun Zhu
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*/

	import java.io.ByteArrayOutputStream;
	import java.io.FileOutputStream;
	import java.io.IOException;
	import java.io.RandomAccessFile;
	import java.nio.ByteBuffer;
	import java.nio.ByteOrder;
	import java.nio.channels.Channels;
	import java.nio.channels.FileChannel;
	import java.util.Arrays;
	import java.util.zip.InflaterOutputStream;

	/**
	* QQ Pinyin IME QPYD File Reader
	*
	* <pre>
	* QPYD Format overview:
	*
	* General Information:
	* - Chinese characters are all encoded with UTF-16LE.
	* - Pinyin are encoded in ascii (or UTF-8).
	* - Numbers are using little endian byte order.
	*
	* QPYD hex analysis:
	* - 0x00 QPYD file identifier
	* - 0x38 offset of compressed data (word-pinyin-dictionary)
	* - 0x44 total words in qpyd
	* - 0x60 start of header information
	*
	* Compressed data analysis:
	* - zip/standard (beginning with 0x789C) is used in (all analyzed) qpyd files
	* - data is divided in two parts
	* -- 1. offset and length information (16 bytes for each pinyin-word pair)
	* 0x06 offset points to first pinyin
	* 0x00 length of pinyin
	* 0x01 length of word
	* -- 2. actual data
	* Dictionary data has the form ((pinyin)(word))* with no separators.
	* Data can only be read using offset and length information.
	*
	* </pre>
	*
	*/
	public class QQPinyinQpydReader {
	public static void main(final String[] args) throws IOException {
	// download from http://dict.py.qq.com/list.php
	final String qqydFile = args[0];

	// read qpyd into byte array
	final ByteArrayOutputStream dataOut = new ByteArrayOutputStream();

	try (RandomAccessFile file = new RandomAccessFile(qqydFile, "r"); final FileChannel fChannel = file.getChannel();) {
	fChannel.transferTo(0, fChannel.size(), Channels.newChannel(dataOut));
	}

	// qpyd as bytes
	final ByteBuffer dataRawBytes = ByteBuffer.wrap(dataOut.toByteArray());
	dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);

	System.out.println("文件: " + qqydFile);

	// read info of compressed data
	final int startZippedDictAddr = dataRawBytes.getInt(0x38);
	final int zippedDictLength = dataRawBytes.limit() - startZippedDictAddr;

	// qpys as UTF-16LE string
	final String dataString = new String(Arrays.copyOfRange(dataRawBytes.array(), 0x60, startZippedDictAddr), "UTF-16LE");

	// print header
	System.out.println("名称：" + QQPinyinQpydReader.substringBetween(dataString, "Name: ", "\r\n"));
	System.out.println("类型：" + QQPinyinQpydReader.substringBetween(dataString, "Type: ", "\r\n"));
	System.out.println("子类型：" + QQPinyinQpydReader.substringBetween(dataString, "FirstType: ", "\r\n"));
	System.out.println("词库说明：" + QQPinyinQpydReader.substringBetween(dataString, "Intro: ", "\r\n"));
	System.out.println("词库样例：" + QQPinyinQpydReader.substringBetween(dataString, "Example: ", "\r\n"));
	System.out.println("词条数：" + dataRawBytes.getInt(0x44));

	// read zipped qqyd dictionary into byte array
	dataOut.reset();
	try (InflaterOutputStream inflater = new InflaterOutputStream(dataOut);) {
	Channels.newChannel(inflater).write(ByteBuffer.wrap(dataRawBytes.array(), startZippedDictAddr, zippedDictLength));
	}

	// uncompressed qqyd dictionary as bytes
	final ByteBuffer dataUnzippedBytes = ByteBuffer.wrap(dataOut.toByteArray());
	dataUnzippedBytes.order(ByteOrder.LITTLE_ENDIAN);

	// for debugging: save unzipped data to *.unzipped file
	try (FileOutputStream out = new FileOutputStream(qqydFile + ".unzipped");) {
	Channels.newChannel(out).write(dataUnzippedBytes);
	System.out.println("压缩数据：0x" + Integer.toHexString(startZippedDictAddr) + " (解压前：" + zippedDictLength + " B, 解压后：" + dataUnzippedBytes.limit() + " B)");
	}

	// stores the start address of actual dictionary data
	int unzippedDictStartAddr = -1;
	final byte[] byteArray = dataUnzippedBytes.array();
	dataUnzippedBytes.position(0);
	while ((unzippedDictStartAddr == -1) \|\| (dataUnzippedBytes.position() < unzippedDictStartAddr)) {
	// read word
	final int pinyinLength = dataUnzippedBytes.get() & 0xff;
	final int wordLength = dataUnzippedBytes.get() & 0xff;
	dataUnzippedBytes.getInt(); // garbage
	final int pinyinStartAddr = dataUnzippedBytes.getInt();
	final int wordStartAddr = pinyinStartAddr + pinyinLength;

	if (unzippedDictStartAddr == -1) {
	unzippedDictStartAddr = pinyinStartAddr;
	System.out.println("词库地址（解压后）：0x" + Integer.toHexString(unzippedDictStartAddr) + "\n");
	}

	final String pinyin = new String(Arrays.copyOfRange(byteArray, pinyinStartAddr, pinyinStartAddr + pinyinLength), "UTF-8");
	final String word = new String(Arrays.copyOfRange(byteArray, wordStartAddr, wordStartAddr + wordLength), "UTF-16LE");
	System.out.println(word + "\t" + pinyin);
	}
	}

	public static final String substringBetween(final String text, final String start, final String end) {
	final int nStart = text.indexOf(start);
	final int nEnd = text.indexOf(end, nStart + 1);
	if ((nStart != -1) && (nEnd != -1)) {
	return text.substring(nStart + start.length(), nEnd);
	} else {
	return null;
	}
	}
	}
	/* 搜狗拼音输入法SCEL词库文件解析
	* Copyright (c) 2010 Xiaoyun Zhu
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*/

	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.io.RandomAccessFile;
	import java.nio.ByteBuffer;
	import java.nio.ByteOrder;
	import java.nio.channels.Channels;
	import java.nio.channels.FileChannel;

	/**
	* Sougou Pinyin IME SCEL File Reader
	*
	* <pre>
	* SCEL Format overview:
	*
	* General Information:
	* - Chinese characters and pinyin are all encoded with UTF-16LE.
	* - Numbers are using little endian byte order.
	*
	* SCEL hex analysis:
	* - 0x0 Pinyin List Offset
	* - 0x120 total number of words
	* - 0x<PY-Offset> total number of pinyin
	* - ... List of pinyin as [index, byte length of pinyin, pinyin as string] triples
	* - ... Dictionary
	* - ... <additional garbage>
	*
	* Dictionary format:
	* - It can be interpreted as a list of
	* [alternatives of words,
	* byte length of pinyin indexes, pinyin indexes,
	* [byte length of word, word as string, length of skip bytes, skip bytes]
	* ... (alternatives)
	* ].
	*
	* </pre>
	*
	*/
	class SogouScelReader {
		/**
		 * Reads the SCEL file named by {@code args[0]} and prints every dictionary entry as
		 * {@code word<TAB>pinyin} (alternative words joined by {@code ", "}, syllables joined
		 * by {@code '}), followed by the number of entries read.
		 *
		 * @param args command line arguments; {@code args[0]} is the path of the SCEL file
		 * @throws IOException if the file cannot be read
		 */
		public static void main(final String[] args) throws IOException {

			// download from http://pinyin.sogou.com/dict
			final String scelFile = args[0];

			// read scel into byte array
			final ByteArrayOutputStream dataOut = new ByteArrayOutputStream();

			try (RandomAccessFile file = new RandomAccessFile(scelFile, "r"); final FileChannel fChannel = file.getChannel();) {
				fChannel.transferTo(0, fChannel.size(), Channels.newChannel(dataOut));
			}

			// scel as bytes
			final ByteBuffer dataRawBytes = ByteBuffer.wrap(dataOut.toByteArray());
			dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);

			System.out.println("文件: " + scelFile);

			final int totalWords = dataRawBytes.getInt(0x120);

			// the int at offset 0 is the offset of the pinyin list
			dataRawBytes.position(dataRawBytes.getInt());
			final int totalPinyin = dataRawBytes.getInt();

			// pinyin table, grown on demand so that any unsigned 16-bit index fits
			String[] pyDict = new String[512];
			for (int i = 0; i < totalPinyin; i++) {
				// all 16-bit fields are unsigned; mask to avoid sign extension
				final int idx = dataRawBytes.getShort() & 0xffff;
				final int len = dataRawBytes.getShort() & 0xffff;
				if (idx >= pyDict.length) {
					pyDict = java.util.Arrays.copyOf(pyDict, Math.max(idx + 1, pyDict.length * 2));
				}
				pyDict[idx] = SogouScelReader.readUtf16(dataRawBytes, len);
			}

			// extract dictionary
			int counter = 0;
			for (int i = 0; i < totalWords; i++) {
				final StringBuilder py = new StringBuilder();
				final StringBuilder word = new StringBuilder();

				final int alternatives = dataRawBytes.getShort() & 0xffff;
				// the stored value is the byte length of the index list; each index is 2 bytes
				final int pyCount = (dataRawBytes.getShort() & 0xffff) / 2;
				for (int p = 0; p < pyCount; p++) {
					final int key = dataRawBytes.getShort() & 0xffff;
					if (p > 0) {
						py.append('\'');
					}
					// an index without a table entry is printed as "null", in range or not
					py.append(key < pyDict.length ? pyDict[key] : null);
				}

				for (int a = 0; a < alternatives; a++) {
					if (a > 0) {
						word.append(", ");
					}
					final int wordLength = dataRawBytes.getShort() & 0xffff;
					word.append(SogouScelReader.readUtf16(dataRawBytes, wordLength));
					// skip bytes: advance the position instead of copying them into a
					// fixed-size scratch buffer, so any skip length is accepted
					final int skip = dataRawBytes.getShort() & 0xffff;
					dataRawBytes.position(dataRawBytes.position() + skip);
				}
				System.out.println(word.toString() + "\t" + py.toString());
				counter++;
			}
			System.out.println("\n读出词汇'" + scelFile + "': " + counter);
		}

		/**
		 * Reads {@code byteLength} bytes from the current position of {@code data} and decodes
		 * them as UTF-16LE.
		 *
		 * @param data       buffer positioned at the first byte of the string
		 * @param byteLength number of bytes (not characters) to consume
		 * @return the decoded string
		 * @throws IOException if the UTF-16LE charset is unavailable
		 */
		private static String readUtf16(final ByteBuffer data, final int byteLength) throws IOException {
			final byte[] bytes = new byte[byteLength];
			data.get(bytes);
			return new String(bytes, "UTF-16LE");
		}
	}