-
-
Save rubyu/cb04e8457993a5e53cc6c1bc1d8d4796 to your computer and use it in GitHub Desktop.
import sbt.Keys._ | |
import sbt._ | |
import sbtassembly.AssemblyPlugin.autoImport._ | |
/**
 * sbt build definition for the `ebquery` command line tool.
 *
 * Declares a single root project that is packaged as a fat jar through
 * sbt-assembly.
 */
object Build extends sbt.Build {

  /** Coordinates and compiler version shared by every project in this build. */
  lazy val commonSettings =
    Defaults.coreDefaultSettings ++
      Seq(
        version := "0.3.2",
        scalaVersion := "2.11.8",
        organization := "com.github.rubyu",
        name := "ebquery"
      )

  /** The root (and only) project. */
  lazy val project =
    Project("ebquery", file("."))
      .settings(commonSettings: _*)
      .settings(Seq(
        mainClass in assembly := Some("com.github.rubyu.ebquery.Main"),
        assemblyJarName in assembly := name.value + "-" + version.value + ".jar"
      ))
      .settings(Seq(
        scalacOptions := Seq(
          "-deprecation",
          "-unchecked",
          "-feature"
        )
      ))
      .settings(
        libraryDependencies ++= Seq(
          "com.github.tototoshi" %% "scala-csv" % "1.3.6",
          // %% appends the Scala binary version, so the artifact keeps
          // following scalaVersion instead of being pinned to _2.11 by hand.
          "org.scala-lang.modules" %% "scala-xml" % "1.0.4",
          "org.slf4j" % "slf4j-api" % "1.7.21",
          "org.slf4j" % "slf4j-simple" % "1.7.21",
          "args4j" % "args4j" % "2.0.26",
          "commons-codec" % "commons-codec" % "1.9",
          "commons-lang" % "commons-lang" % "2.4",
          "org.specs2" %% "specs2-core" % "3.7.2" % "test",
          "junit" % "junit" % "4.7" % "test",
          "com.rexsl" % "rexsl-w3c" % "0.13" % "test",
          "com.rexsl" % "rexsl-test" % "0.4.12" % "test",
          "javax.json" % "javax.json-api" % "1.0" % "test",
          // halt warning messages for multiple dependencies
          // (tied to scalaVersion so they cannot drift from the compiler in use)
          "org.scala-lang" % "scala-reflect" % scalaVersion.value % "test",
          "org.scala-lang" % "scala-compiler" % scalaVersion.value % "test",
          // halt warning messages for circular dependencies
          "com.jcabi" % "jcabi-log" % "0.12.1" % "test"
        )
      )
}
package io.github.eb4j; | |
import java.io.UnsupportedEncodingException; | |
import io.github.eb4j.hook.Hook; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
import io.github.eb4j.io.EBFile; | |
import io.github.eb4j.io.BookInputStream; | |
import io.github.eb4j.util.ByteUtil; | |
import io.github.eb4j.util.HexUtil; | |
import com.github.rubyu.ebquery.IExporter; | |
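/**
 * Enumerates the entries of a sub-book's word index and passes each entry
 * (index value, heading, text) to an {@link IExporter}, instead of searching
 * for a single word.
 *
 * Adapted from eb4j's SingleWordSearcher.
 *
 * @author Hisaya FUKUMOTO
 *
 * Copied from: https://github.com/eb4j/eb4j/blob/5c1dd0a8aa6eca5ae7489787456333d7eef5fa2a/eb4j-core/src/main/java/io/github/eb4j/SingleWordSearcher.java
 */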
public class EntryEnumerator implements Searcher { | |
private Hook _hook = null; | |
private com.github.rubyu.ebquery.IExporter _exporter = null; | |
public static EntryEnumerator Create(SubBook sub, Hook<String> hook, IExporter exporter) { | |
IndexStyle[] wordStyles = new IndexStyle[3]; | |
wordStyles[0] = sub.getWordIndexStyle(0); // KANA | |
wordStyles[1] = sub.getWordIndexStyle(1); // KANJI | |
wordStyles[2] = sub.getWordIndexStyle(2); // ALPHABET | |
IndexStyle wordStyle; | |
if (wordStyles[2] != null) { | |
wordStyle = wordStyles[2]; | |
} else { | |
wordStyle = wordStyles[1]; | |
} | |
return new EntryEnumerator(sub, hook, exporter, wordStyle, EXACTWORD); | |
} | |
/** 前方一致検索を示す定数 */ | |
protected static final int WORD = 0; | |
/** 後方一致検索を示す定数 */ | |
protected static final int ENDWORD = 1; | |
/** 完全一致検索を示す定数 */ | |
protected static final int EXACTWORD = 2; | |
/** 条件検索を示す定数 */ | |
protected static final int KEYWORD = 3; | |
/** クロス検索を示す定数 */ | |
protected static final int CROSS = 4; | |
/** 複合検索を示す定数 */ | |
protected static final int MULTI = 5; | |
/** 最大インデックス深さ */ | |
private static final int MAX_INDEX_DEPTH = 6; | |
/** 項目の配置スタイル */ | |
private static final int VARIABLE = 0; | |
/** 項目の配置スタイル */ | |
private static final int FIXED = 1; | |
/** ログ */ | |
private Logger _logger = null; | |
/** 副本 */ | |
private SubBook _sub = null; | |
/** インデックススタイル */ | |
private IndexStyle _style = null; | |
/** 現在の検索種別 */ | |
private int _type = 0; | |
/** 検索語 */ | |
private byte[] _word = null; | |
/** 検索キー */ | |
private byte[] _canonical = null; | |
/** 検索するファイル */ | |
private EBFile _file = null; | |
/** キャッシュ */ | |
private byte[] _cache = new byte[BookInputStream.PAGE_SIZE]; | |
/** キャシュのページ位置 */ | |
private long _cachePage = 0L; | |
/** キャシュのオフセット位置 */ | |
private int _off = 0; | |
/** データのページ位置 */ | |
private long _page = 0L; | |
/** データのページID */ | |
private int _pageID = 0; | |
/** エントリのサイズ */ | |
private int _entryLength = 0; | |
/** エントリの配置方法 */ | |
private int _entryArrangement = 0; | |
/** エントリの数 */ | |
private int _entryCount = 0; | |
/** エントリのインデックス */ | |
private int _entryIndex = 0; | |
/** グループエントリ内であることを示すフラグ */ | |
private boolean _inGroupEntry = false; | |
/** 比較結果 */ | |
private int _comparison = -1; | |
/** キーワード検索用見出し位置 */ | |
private long _keywordHeading = 0L; | |
private byte[] _currentGroupEntryIndex = null; | |
/**
 * Builds the enumerator and immediately runs the index pre-search by
 * calling {@code search()}.
 *
 * @param sub      subbook.
 * @param hook     hook stored for rendering headings and text.
 * @param exporter exporter stored for receiving the entries.
 * @param style    index style.
 * @param type     search type; one of {@link #WORD}, {@link #ENDWORD},
 *                 {@link #EXACTWORD}, {@link #KEYWORD}, {@link #CROSS},
 *                 {@link #MULTI}.
 */
protected EntryEnumerator(final SubBook sub, final Hook<String> hook, final IExporter exporter, final IndexStyle style, final int type) {
    super();
    this._logger = LoggerFactory.getLogger(getClass());

    // Book side.
    this._sub = sub;
    this._file = sub.getTextFile();
    this._style = style;
    this._type = type;

    // Output side.
    this._hook = hook;
    this._exporter = exporter;

    search();
}
private int _comparePre(final byte[] key, final byte[] pattern) { | |
return 0; | |
} | |
private int _compareSingle(byte[] key, byte[] pattern) { | |
return 1; | |
} | |
private int _compareGroup(byte[] key, byte[] pattern) { | |
return 0; | |
} | |
private void search() { | |
try { | |
search("dummy search string".getBytes()); | |
} catch (EBException ex) {} | |
} | |
private void export(byte[] indexBytes, Result result) { | |
try { | |
String indexValue = null; | |
try { | |
indexValue = new String(indexBytes, "x-JIS0208"); | |
} catch (UnsupportedEncodingException ex) {} | |
if (indexValue == null || indexValue.contains("\uFFFD")) { | |
return; | |
} | |
String heading = result.getHeading(this._hook).toString(); | |
String description = result.getText(this._hook).toString(); | |
this._exporter.export(indexValue, heading, description); | |
} catch (EBException ex) {} | |
} | |
/** | |
* Set a word to search. | |
* | |
* @param word a search word. | |
*/ | |
private void _setWord(final byte[] word) { | |
int len = word.length; | |
_word = new byte[len]; | |
System.arraycopy(word, 0, _word, 0, len); | |
_canonical = new byte[len]; | |
System.arraycopy(word, 0, _canonical, 0, len); | |
if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
_style.fixWordLatin(_canonical); | |
} else { | |
_style.fixWord(_canonical); | |
} | |
if (_style.getIndexID() != 0x70 && _style.getIndexID() != 0x90) { | |
System.arraycopy(_canonical, 0, _word, 0, len); | |
} | |
// 後方検索の場合、反転する | |
if (_type == ENDWORD) { | |
if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
ByteUtil.reverseWordLatin(_word); | |
ByteUtil.reverseWordLatin(_canonical); | |
} else { | |
ByteUtil.reverseWord(_word); | |
ByteUtil.reverseWord(_canonical); | |
} | |
} | |
try { | |
_logger.debug("search word: '" + new String(_word, "x-JIS0208") + "'"); | |
_logger.debug("search canonical word: '" + new String(_canonical, "x-JIS0208") + "'"); | |
} catch (UnsupportedEncodingException e) { | |
} | |
} | |
// /** | |
// * キーとパターンを比較します。 | |
// * | |
// * @param key キー | |
// * @param pattern パターン | |
// * @return キーがパターンと同じ場合:0、 | |
// * キーがパターンより大きい場合:1以上、 | |
// * キーがパターンより小さい場合:-1以下 | |
// */ | |
// private int _comparePre(final byte[] key, final byte[] pattern) { | |
// int comp = 0; | |
// switch (_type) { | |
// case EXACTWORD: | |
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
// comp = CompareUtil.compareToLatin(key, pattern, true); | |
// } else { | |
// comp = CompareUtil.compareToJISX0208(key, pattern, true); | |
// } | |
// break; | |
// case MULTI: | |
// if (_style.getCandidatePage() == 0) { | |
// comp = CompareUtil.compareToByte(key, pattern, true); | |
// } else { | |
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
// comp = CompareUtil.compareToLatin(key, pattern, true); | |
// } else { | |
// comp = CompareUtil.compareToJISX0208(key, pattern, true); | |
// } | |
// } | |
// break; | |
// case WORD: | |
// case ENDWORD: | |
// case KEYWORD: | |
// case CROSS: | |
// default: | |
// comp = CompareUtil.compareToByte(key, pattern, true); | |
// break; | |
// } | |
// try { | |
// _logger.debug("compare key word: (" + comp + ") '" | |
// + new String(key, "x-JIS0208") + "' '" | |
// + new String(pattern, "x-JIS0208") + "'"); | |
// } catch (UnsupportedEncodingException e) { | |
// } | |
// return comp; | |
// } | |
// | |
// /** | |
// * キーとパターンを比較します。 | |
// * | |
// * @param key キー | |
// * @param pattern パターン | |
// * @return キーがパターンと同じ場合:0、 | |
// * キーがパターンより大きい場合:1以上、 | |
// * キーがパターンより小さい場合:-1以下 | |
// */ | |
// private int _compareSingle(final byte[] key, final byte[] pattern) { | |
// int comp = 0; | |
// switch (_type) { | |
// case ENDWORD: | |
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
// comp = CompareUtil.compareToByte(key, pattern, false); | |
// } else { | |
// IndexStyle style = _sub.getEndwordIndexStyle(SubBook.KANA); | |
// if (style != null && _style.getStartPage() == style.getStartPage()) { | |
// comp = CompareUtil.compareToKanaSingle(key, pattern, false); | |
// } else { | |
// comp = CompareUtil.compareToByte(key, pattern, false); | |
// } | |
// } | |
// break; | |
// case EXACTWORD: | |
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
// comp = CompareUtil.compareToLatin(key, pattern, false); | |
// } else { | |
// IndexStyle style = _sub.getWordIndexStyle(SubBook.KANA); | |
// if (style != null && _style.getStartPage() == style.getStartPage()) { | |
// comp = CompareUtil.compareToKanaSingle(key, pattern, true); | |
// } else { | |
// comp = CompareUtil.compareToJISX0208(key, pattern, false); | |
// } | |
// } | |
// break; | |
// case KEYWORD: | |
// case CROSS: | |
// comp = CompareUtil.compareToByte(key, pattern, false); | |
// break; | |
// case MULTI: | |
// if (_style.getCandidatePage() == 0) { | |
// comp = CompareUtil.compareToByte(key, pattern, false); | |
// } else { | |
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
// comp = CompareUtil.compareToLatin(key, pattern, false); | |
// } else { | |
// comp = CompareUtil.compareToJISX0208(key, pattern, false); | |
// } | |
// } | |
// break; | |
// case WORD: | |
// default: | |
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
// comp = CompareUtil.compareToByte(key, pattern, false); | |
// } else { | |
// IndexStyle style = _sub.getWordIndexStyle(SubBook.KANA); | |
// if (style != null && _style.getStartPage() == style.getStartPage()) { | |
// comp = CompareUtil.compareToKanaSingle(key, pattern, false); | |
// } else { | |
// comp = CompareUtil.compareToByte(key, pattern, false); | |
// } | |
// } | |
// break; | |
// } | |
// try { | |
// _logger.debug("compare key word: (" + comp + ") '" | |
// + new String(key, "x-JIS0208") + "' '" | |
// + new String(pattern, "x-JIS0208") + "'"); | |
// } catch (UnsupportedEncodingException e) { | |
// } | |
// return comp; | |
// } | |
// | |
// /** | |
// * キーとパターンを比較します。 | |
// * | |
// * @param key キー | |
// * @param pattern パターン | |
// * @return キーがパターンと同じ場合:0、 | |
// * キーがパターンより大きい場合:1以上、 | |
// * キーがパターンより小さい場合:-1以下 | |
// */ | |
// private int _compareGroup(final byte[] key, final byte[] pattern) { | |
// int comp = 0; | |
// switch (_type) { | |
// case EXACTWORD: | |
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
// comp = CompareUtil.compareToLatin(key, pattern, false); | |
// } else { | |
// comp = CompareUtil.compareToKanaGroup(key, pattern, true); | |
// } | |
// break; | |
// case MULTI: | |
// if (_style.getCandidatePage() == 0) { | |
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
// comp = CompareUtil.compareToByte(key, pattern, false); | |
// } else { | |
// comp = CompareUtil.compareToKanaGroup(key, pattern, false); | |
// } | |
// } else { | |
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
// comp = CompareUtil.compareToLatin(key, pattern, false); | |
// } else { | |
// comp = CompareUtil.compareToKanaGroup(key, pattern, true); | |
// } | |
// } | |
// break; | |
// case WORD: | |
// case ENDWORD: | |
// case KEYWORD: | |
// case CROSS: | |
// default: | |
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) { | |
// comp = CompareUtil.compareToByte(key, pattern, false); | |
// } else { | |
// comp = CompareUtil.compareToKanaGroup(key, pattern, false); | |
// } | |
// break; | |
// } | |
// try { | |
// _logger.debug("compare key word: (" + comp + ") '" | |
// + new String(key, "x-JIS0208") + "' '" | |
// + new String(pattern, "x-JIS0208") + "'"); | |
// } catch (UnsupportedEncodingException e) { | |
// } | |
// return comp; | |
// } | |
/** | |
* 検索を行います。 | |
* | |
* @param word 検索語 | |
* @exception EBException 前処理中にエラーが発生した場合 | |
*/ | |
protected void search(final byte[] word) throws EBException { | |
_setWord(word); | |
_page = _style.getStartPage(); | |
// pre-search | |
BookInputStream bis = _file.getInputStream(); | |
try { | |
long nextPage = _page; | |
int depth; | |
for (depth=0; depth<MAX_INDEX_DEPTH; depth++) { | |
// データをキャッシュへ読み込む | |
bis.seek(_page, 0); | |
bis.readFully(_cache, 0, _cache.length); | |
_cachePage = _page; | |
_pageID = _cache[0] & 0xff; | |
_entryLength = _cache[1] & 0xff; | |
if (_entryLength == 0) { | |
_entryArrangement = VARIABLE; | |
} else { | |
_entryArrangement = FIXED; | |
} | |
_entryCount = ByteUtil.getInt2(_cache, 2); | |
_off = 4; | |
_logger.debug("page=0x" + HexUtil.toHexString(_page) | |
+ ", ID=0x" + HexUtil.toHexString(_pageID)); | |
// リーフインデックスに達っしたらループ終了 | |
if (_isLeafLayer(_pageID)) { | |
break; | |
} | |
// 次のレベルのインデックスを取得する | |
byte[] b = new byte[_entryLength]; | |
for (_entryIndex=0; _entryIndex<_entryCount; _entryIndex++) { | |
if (_off + _entryLength + 4 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
System.arraycopy(_cache, _off, b, 0, b.length); | |
_off += _entryLength; | |
if (_comparePre(_canonical, b) <= 0) { | |
nextPage = ByteUtil.getLong4(_cache, _off); | |
break; | |
} | |
_off += 4; | |
} | |
if (_entryIndex >= _entryCount || nextPage == _page) { | |
_comparison = -1; | |
return; | |
} | |
_page = nextPage; | |
} | |
// インデックス深さのチェック | |
if (depth == MAX_INDEX_DEPTH) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
} finally { | |
bis.close(); | |
} | |
_entryIndex = 0; | |
_comparison = 1; | |
_inGroupEntry = false; | |
} | |
/** | |
* 次の検索結果を返します。 | |
* | |
* @return 検索結果 (次の検索結果がない場合null) | |
* @exception EBException 検索中にエラーが発生した場合 | |
*/ | |
@Override | |
public Result getNextResult() throws EBException { | |
if (_comparison < 0) { | |
return null; | |
} | |
while (true) { | |
refreshCache(); | |
if (!_isLeafLayer(_pageID)) { | |
// リーフインデックスでなければ例外 | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
if (!_hasGroupEntry(_pageID)) { | |
while (_entryIndex < _entryCount) { | |
parseNonGroupEntry(); | |
} | |
} else { | |
while (_entryIndex < _entryCount) { | |
parseGroupedEntry(); | |
} | |
} | |
// 次ページが存在すれば続行、存在しなければ終了 | |
if (_isLayerEnd(_pageID)) { | |
_comparison = -1; | |
break; | |
} | |
_page++; | |
_entryIndex = 0; | |
} | |
return null; | |
} | |
/** | |
* 指定されたページが最下層かどうかを判別します。 | |
* | |
* @param id ページID | |
* @return 最下層である場合はtrue、そうでない場合はfalse | |
*/ | |
private boolean _isLeafLayer(final int id) { | |
if ((id & 0x80) == 0x80) { | |
return true; | |
} | |
return false; | |
} | |
// /** | |
// * 指定されたページが階層開始ページかどうかを判別します。 | |
// * | |
// * @param id ページID | |
// * @return 階層開始ページである場合はtrue、そうでない場合はfalse | |
// */ | |
// private boolean _isLayerStart(int id) { | |
// if ((id & 0x40) == 0x40) { | |
// return true; | |
// } | |
// return false; | |
// } | |
/** | |
* 指定されたページが階層終了ページかどうかを判別します。 | |
* | |
* @param id ページID | |
* @return 階層終了ページである場合はtrue、そうでない場合はfalse | |
*/ | |
private boolean _isLayerEnd(final int id) { | |
if ((id & 0x20) == 0x20) { | |
return true; | |
} | |
return false; | |
} | |
/** | |
* 指定されたページがグループエントリを含んでいるかどうか判別します。 | |
* | |
* @param id ページID | |
* @return グループエントリを含んでいる場合はtrue、そうでない場合はfalse | |
*/ | |
private boolean _hasGroupEntry(final int id) { | |
if ((id & 0x10) == 0x10) { | |
return true; | |
} | |
return false; | |
} | |
// キャッシュとデータのページが異なれば読み込む | |
private void refreshCache() throws EBException { | |
if (_cachePage != _page) { | |
BookInputStream bis = _file.getInputStream(); | |
try { | |
bis.seek(_page, 0); | |
bis.readFully(_cache, 0, _cache.length); | |
} finally { | |
bis.close(); | |
} | |
_cachePage = _page; | |
if (_entryIndex == 0) { | |
_pageID = _cache[0] & 0xff; | |
_entryLength = _cache[1] & 0xff; | |
if (_entryLength == 0) { | |
_entryArrangement = VARIABLE; | |
} else { | |
_entryArrangement = FIXED; | |
} | |
_entryCount = ByteUtil.getInt2(_cache, 2); | |
_entryIndex = 0; | |
_off = 4; | |
_logger.info("page=0x" + HexUtil.toHexString(_page) | |
+ ", ID=0x" + HexUtil.toHexString(_pageID)); | |
} | |
} | |
} | |
// グループエントリなし | |
private void parseNonGroupEntry() throws EBException { | |
if (_entryArrangement == VARIABLE) { | |
if (_off + 1 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
_entryLength = _cache[_off] & 0xff; | |
_off++; | |
} | |
if (_off + _entryLength + 12 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
byte[] b = new byte[_entryLength]; | |
System.arraycopy(_cache, _off, b, 0, b.length); | |
_off += _entryLength; | |
_comparison = _compareSingle(_word, b); | |
if (_comparison == 1) { | |
// 本文/見出し位置の取得 | |
long tPage = ByteUtil.getLong4(_cache, _off); | |
int tOff = ByteUtil.getInt2(_cache, _off+4); | |
long hPage = ByteUtil.getLong4(_cache, _off+6); | |
int hOff = ByteUtil.getInt2(_cache, _off+10); | |
Result result = new Result(_sub, hPage, hOff, tPage, tOff); | |
export(b, result); | |
} | |
_entryIndex++; | |
_off += 12; | |
} | |
// グループエントリあり | |
private void parseGroupedEntry() throws EBException { | |
if (_off + 2 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
int groupID = _cache[_off] & 0xff; | |
if (groupID == 0x00) { | |
// シングルエントリ | |
_entryLength = _cache[_off+1] & 0xff; | |
if (_off + _entryLength + 14 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
byte[] b = new byte[_entryLength]; | |
System.arraycopy(_cache, _off+2, b, 0, b.length); | |
_off += _entryLength + 2; | |
_comparison = _compareSingle(_canonical, b); | |
if (_comparison == 1) { | |
// 本文/見出し位置の取得 | |
long tPage = ByteUtil.getLong4(_cache, _off); | |
int tOff = ByteUtil.getInt2(_cache, _off+4); | |
long hPage = ByteUtil.getLong4(_cache, _off+6); | |
int hOff = ByteUtil.getInt2(_cache, _off+10); | |
Result result = new Result(_sub, hPage, hOff, tPage, tOff); | |
export(b, result); | |
} | |
_off += 12; | |
_inGroupEntry = false; | |
} else if (groupID == 0x80) { | |
// グループエントリの開始 | |
_entryLength = _cache[_off+1] & 0xff; | |
byte[] b = new byte[_entryLength]; | |
if (_type == KEYWORD || _type == CROSS) { | |
if (_off + _entryLength + 12 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
System.arraycopy(_cache, _off+6, b, 0, b.length); | |
_off += _entryLength + 6; | |
_comparison = _compareSingle(_word, b); | |
long hPage = ByteUtil.getLong4(_cache, _off); | |
int hOff = ByteUtil.getInt2(_cache, _off+4); | |
_keywordHeading = | |
BookInputStream.getPosition(hPage, hOff); | |
_off += 6; | |
} else if (_type == MULTI) { | |
if (_off + _entryLength + 6 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
System.arraycopy(_cache, _off+6, b, 0, b.length); | |
_comparison = _compareSingle(_word, b); | |
_off += _entryLength + 6; | |
} else { | |
if (_off + _entryLength + 4 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
System.arraycopy(_cache, _off+4, b, 0, b.length); | |
_comparison = _compareSingle(_canonical, b); | |
_off += _entryLength + 4; | |
} | |
_currentGroupEntryIndex = b.clone(); | |
_inGroupEntry = true; | |
} else if (groupID == 0xc0) { | |
// グループエントリの要素 | |
if (_type == KEYWORD || _type == CROSS) { | |
if (_off + 7 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
if (_comparison == 1 && _inGroupEntry) { | |
// 本文/見出し位置の取得 | |
long tPage = ByteUtil.getLong4(_cache, _off+1); | |
int tOff = ByteUtil.getInt2(_cache, _off+5); | |
Result result = new Result(_sub, _keywordHeading, tPage, tOff); | |
_keywordHeading = | |
_sub.getNextHeadingPosition(_keywordHeading); | |
export(_currentGroupEntryIndex, result); | |
} | |
_off += 7; | |
} else if (_type == MULTI) { | |
if (_off + 13 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
if (_comparison == 1 && _inGroupEntry) { | |
// 本文/見出し位置の取得 | |
long tPage = ByteUtil.getLong4(_cache, _off+1); | |
int tOff = ByteUtil.getInt2(_cache, _off+5); | |
long hPage = ByteUtil.getLong4(_cache, _off+7); | |
int hOff = ByteUtil.getInt2(_cache, _off+11); | |
Result result = new Result(_sub, hPage, hOff, tPage, tOff); | |
export(_currentGroupEntryIndex, result); | |
} | |
_off += 13; | |
} else { | |
_entryLength = _cache[_off+1] & 0xff; | |
if (_off + _entryLength + 14 > BookInputStream.PAGE_SIZE) { | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
byte[] b = new byte[_entryLength]; | |
System.arraycopy(_cache, _off+2, b, 0, b.length); | |
_off += _entryLength + 2; | |
if (_comparison == 1 && _inGroupEntry | |
&& _compareGroup(_word, b) == 0) { | |
// 本文/見出し位置の取得 | |
long tPage = ByteUtil.getLong4(_cache, _off); | |
int tOff = ByteUtil.getInt2(_cache, _off+4); | |
long hPage = ByteUtil.getLong4(_cache, _off+6); | |
int hOff = ByteUtil.getInt2(_cache, _off+10); | |
Result result = new Result(_sub, hPage, hOff, tPage, tOff); | |
export(b, result); | |
} | |
_off += 12; | |
} | |
} else { | |
// 未知のID | |
throw new EBException(EBException.UNEXP_FILE, _file.getPath()); | |
} | |
_entryIndex++; | |
} | |
} | |
// end of SingleWordSearcher.java |
package com.github.rubyu.ebquery; | |
public interface IExporter { | |
public void export(String indexValue, String heading, String description); | |
} |
package com.github.rubyu.ebquery | |
import java.io.{File, PrintWriter} | |
import io.github.eb4j.{Book, EntryEnumerator, SingleWordEnumerator} | |
import io.github.eb4j.io.{BookReader, EPWINGInputStream} | |
import java.lang.reflect.Method | |
import org.specs2.mutable.Specification | |
/**
 * Manual smoke test: dumps the dictionary found at a local path to
 * `SRD.dump.tsv` through [[TSVExporter]].
 */
class JSONExporterTest extends Specification {
  "EBLeaf" should {
    "equal with a instance be of the same string" in {
      val dir = "/Users/i.y.nishiseki/Dropbox/work/dictionary/SRD"
      val mapFilePath = "/Users/i.y.nishiseki/Dropbox/work/dictionary/SRD/SRD.map"
      val subBook = new Book(dir).getSubBook(0)
      val mapper = new ExternalCharacterMapper(mapFilePath)
      val proc = new EBProcessor
      proc.newline = new EBProcessorImpl.text.Newline
      proc.externalCharacter = new EBProcessorImpl.text.ReplacementCharacter
      proc.text = new EBProcessorImpl.text.Text
      val hook = new EBProcessorAdapter(subBook, mapper, proc)
      val exporter = new TSVExporter(new PrintWriter((new File("SRD.dump.tsv"))))
      // Close the exporter even when the dump fails: the PrintWriter is
      // buffered, so rows still in the buffer are lost unless it is
      // flushed/closed.
      try {
        val enumerator = EntryEnumerator.Create(subBook, hook, exporter)
        // println (Iterator.continually(searcher.getNextResult()) takeWhile(_ != null) size)
        // 1118269
        // Iterator.continually(enumerator.getNextResult()) takeWhile(_ != null) foreach { result =>
        // val h = result.getHeading(hook)
        // val t = result.getText(hook)
        // if (h.startsWith("get")) {
        // println(s"{heading=$h, text=$t".replace("\n", "¥n"))
        // }
        // }
        enumerator.getNextResult()
      } finally {
        exporter.close()
      }
      // {heading=ab·so·lute, text=ABSOLUTE ADDRESS.¥n━n.⇑¥n【1】絶対的なもの(↔relative).¥n
      // {heading=act, text=ACT OF TOLERATION [UNIFORMITY].¥n(2)⦅しばしば A-⦆(会議の)記録,
      // {heading=ac·tor, text=a BAD ACTOR.¥n【4】〘ローマ法〙原告(plaintiff), 弁護人(advocat
      // {heading=age, text=STONE AGE¥nBRONZE AGE¥nIRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe
      // {heading=age, text=BRONZE AGE¥nIRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe age of elec
      // {heading=age, text=IRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe age of electronic commu
      true mustEqual true
    }
  }
}
package com.github.rubyu.ebquery | |
import java.io.Writer | |
import com.github.tototoshi.csv._ | |
/**
 * [[IExporter]] that writes each entry as one tab-separated row to `w`.
 *
 * Backslashes and line feeds inside a field are written as `\\` and `\n`.
 */
class TSVExporter(w: Writer) extends IExporter {
  implicit val format = new TSVFormat {}
  val writer = CSVWriter.open(w)(format)

  /** Doubles every backslash, then turns every line feed into backslash + 'n'. */
  private def escape(s: String): String = {
    val backslashesDoubled = s.replace("\\", "\\\\")
    backslashesDoubled.replace("\n", "\\n")
  }

  override def export(indexValue: String, heading: String, description: String): Unit = {
    val row = List(indexValue, heading, description).map(escape)
    writer.writeRow(row)
  }

  /** Closes the underlying CSV writer. */
  def close(): Unit = writer.close()
}
csvq -o SRD.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') AS agg_c3 FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.dump.tsv
)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"
動かない…
csvq -o SRD.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.dump.tsv
)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"
ダメ…
1 h1 d1
2 h1 d2
3 h2 d3
4 h3 d3
csvq -no-header "SELECT c2, LISTAGG(c3, '-') FROM test.tsv
GROUP BY c2"
OK
csvq -no-header "SELECT lower_c2, LISTAGG(c3, '-') FROM (SELECT LOWER(c2) AS lower_c2, c3 FROM test.tsv
) GROUP BY lower_c2"
OK
csvq -no-header "SELECT lower_c2, LISTAGG(c3, '-') FROM (SELECT LOWER(c2) AS lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM test.tsv
)) GROUP BY lower_c2"
OK
あれ…?
grep -E "\tA" SRD.dump.tsv > SRD.A.tsv
csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv
)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"
再現した
rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv
)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"
rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '-') FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv
)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"
rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '-') FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv
)) GROUP BY lower_c2 ORDER BY lower_c2 ASC"
OK!!!!
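GROUP BYの誤用だった… GROUP BY に c3 まで含めていたため、(lower_c2, c3) の組ごとに別グループになり、LISTAGG で連結されていなかった。lower_c2 だけで GROUP BY すればよい。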
dump dump1 get get1 get2
のような、末尾に数字がついてる系の見出しをうまく拾えない
1 h1 d1
2 h1 d2
3 h2 d3
4 h3 d3
csvq -no-header "
SELECT
c2,
SUBSTR(c2, 0,
CASE SUBSTR(c2, LEN(c2) - 1, 1)
WHEN '1' THEN LEN(c2) - 1
WHEN '2' THEN LEN(c2) - 1
WHEN '3' THEN LEN(c2) - 1
WHEN '4' THEN LEN(c2) - 1
WHEN '5' THEN LEN(c2) - 1
WHEN '6' THEN LEN(c2) - 1
WHEN '7' THEN LEN(c2) - 1
WHEN '8' THEN LEN(c2) - 1
WHEN '9' THEN LEN(c2) - 1
ELSE LEN(c2)
END) AS tail_trimmed
FROM test.tsv
GROUP BY c2"
+----+--------------+
| c2 | tail_trimmed |
+----+--------------+
| h1 | h |
| h2 | h |
| h3 | h |
+----+--------------+
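csvq のSUBSTRは0オリジン…? (SUBSTR(c2, 0, …) が先頭文字から切り出しており、後述の INSTR('foo@example.com', '@') も 3 を返すので、csvq の文字位置は 0 オリジンと考えてよさそう)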
1 ha1 d1
2 haa1 d2
3 haaa2 d3
4 haaaa3 d3
csvq -no-header "
SELECT
c2,
SUBSTR(c2, 0,
CASE SUBSTR(c2, LEN(c2) - 1, 1)
WHEN '1' THEN LEN(c2) - 1
WHEN '2' THEN LEN(c2) - 1
WHEN '3' THEN LEN(c2) - 1
WHEN '4' THEN LEN(c2) - 1
WHEN '5' THEN LEN(c2) - 1
WHEN '6' THEN LEN(c2) - 1
WHEN '7' THEN LEN(c2) - 1
WHEN '8' THEN LEN(c2) - 1
WHEN '9' THEN LEN(c2) - 1
ELSE LEN(c2)
END) AS tail_trimmed
FROM test.tsv
GROUP BY c2"
+--------+--------------+
| c2 | tail_trimmed |
+--------+--------------+
| ha1 | ha |
| haa1 | haa |
| haaa2 | haaa |
| haaaa3 | haaaa |
+--------+--------------+
csvq "SELECT INSTR('foo@example.com', '@');"
+-------------------------------+
| INSTR('foo@example.com', '@') |
+-------------------------------+
| 3 |
+-------------------------------+
BQでやる
Google Driveにgzを置いて、外部テーブルとしてBQで設定
INSERT INTO `test.srd`
SELECT * FROM `test.test`
インデックスが0x00で開始されてるものだけフィルタすると、0xFFFDが含まれてるものをフィルタしたのより増加した。これは想定に反してる
csvq -o SRD.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') AS agg_c3 FROM (SELECT LOWER(c2) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM
SRD.dump.tsv
)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"