@rubyu
Last active July 5, 2019 09:57
[WIP] eb2json
import sbt.Keys._
import sbt._
import sbtassembly.AssemblyPlugin.autoImport._
object Build extends sbt.Build {
lazy val commonSettings =
Defaults.coreDefaultSettings ++
Seq(
version := "0.3.2",
scalaVersion := "2.11.8",
organization := "com.github.rubyu",
name := "ebquery"
)
lazy val project =
Project("ebquery", file("."))
.settings(commonSettings: _*)
.settings(Seq(
mainClass in assembly := Some("com.github.rubyu.ebquery.Main"),
assemblyJarName in assembly := name.value + "-" + version.value + ".jar"
))
.settings(Seq(
scalacOptions := Seq(
"-deprecation",
"-unchecked",
"-feature"
)
))
.settings(
libraryDependencies ++= Seq(
"com.github.tototoshi" %% "scala-csv" % "1.3.6",
"org.scala-lang.modules" % "scala-xml_2.11" % "1.0.4",
"org.slf4j" % "slf4j-api" % "1.7.21",
"org.slf4j" % "slf4j-simple" % "1.7.21",
"args4j" % "args4j" % "2.0.26",
"commons-codec" % "commons-codec" % "1.9",
"commons-lang" % "commons-lang" % "2.4",
"org.specs2" %% "specs2-core" % "3.7.2" % "test",
"junit" % "junit" % "4.7" % "test",
"com.rexsl" % "rexsl-w3c" % "0.13" % "test",
"com.rexsl" % "rexsl-test" % "0.4.12" % "test",
"javax.json" % "javax.json-api" % "1.0" % "test",
// halt warning messages for multiple dependencies
"org.scala-lang" % "scala-reflect" % "2.11.8" % "test",
"org.scala-lang" % "scala-compiler" % "2.11.8" % "test",
// halt warning messages for circular dependencies
"com.jcabi" % "jcabi-log" % "0.12.1" % "test"
)
)
}
package io.github.eb4j;
import java.io.UnsupportedEncodingException;
import io.github.eb4j.hook.Hook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import io.github.eb4j.io.EBFile;
import io.github.eb4j.io.BookInputStream;
import io.github.eb4j.util.ByteUtil;
import io.github.eb4j.util.HexUtil;
import com.github.rubyu.ebquery.IExporter;
/**
* Search class for searching with a single word.
* Modified here to enumerate every index entry and hand it to an IExporter.
*
* @author Hisaya FUKUMOTO
*
* Copied from: https://github.com/eb4j/eb4j/blob/5c1dd0a8aa6eca5ae7489787456333d7eef5fa2a/eb4j-core/src/main/java/io/github/eb4j/SingleWordSearcher.java
*/
public class EntryEnumerator implements Searcher {
private Hook<String> _hook = null;
private IExporter _exporter = null;
public static EntryEnumerator Create(SubBook sub, Hook<String> hook, IExporter exporter) {
IndexStyle[] wordStyles = new IndexStyle[3];
wordStyles[0] = sub.getWordIndexStyle(0); // KANA
wordStyles[1] = sub.getWordIndexStyle(1); // KANJI
wordStyles[2] = sub.getWordIndexStyle(2); // ALPHABET
IndexStyle wordStyle;
if (wordStyles[2] != null) {
wordStyle = wordStyles[2];
} else {
wordStyle = wordStyles[1];
}
return new EntryEnumerator(sub, hook, exporter, wordStyle, EXACTWORD);
}
/** Constant indicating prefix (forward-match) search */
protected static final int WORD = 0;
/** Constant indicating suffix (backward-match) search */
protected static final int ENDWORD = 1;
/** Constant indicating exact-match search */
protected static final int EXACTWORD = 2;
/** Constant indicating keyword search */
protected static final int KEYWORD = 3;
/** Constant indicating cross search */
protected static final int CROSS = 4;
/** Constant indicating multi (compound) search */
protected static final int MULTI = 5;
/** Maximum index depth */
private static final int MAX_INDEX_DEPTH = 6;
/** Entry arrangement style (variable length) */
private static final int VARIABLE = 0;
/** Entry arrangement style (fixed length) */
private static final int FIXED = 1;
/** Logger */
private Logger _logger = null;
/** Subbook */
private SubBook _sub = null;
/** Index style */
private IndexStyle _style = null;
/** Current search type */
private int _type = 0;
/** Search word */
private byte[] _word = null;
/** Search key (canonical form) */
private byte[] _canonical = null;
/** File to search */
private EBFile _file = null;
/** Cache */
private byte[] _cache = new byte[BookInputStream.PAGE_SIZE];
/** Page position of the cache */
private long _cachePage = 0L;
/** Offset position within the cache */
private int _off = 0;
/** Page position of the data */
private long _page = 0L;
/** Page ID of the data */
private int _pageID = 0;
/** Entry size */
private int _entryLength = 0;
/** Entry arrangement */
private int _entryArrangement = 0;
/** Number of entries */
private int _entryCount = 0;
/** Entry index */
private int _entryIndex = 0;
/** Flag indicating that we are inside a group entry */
private boolean _inGroupEntry = false;
/** Comparison result */
private int _comparison = -1;
/** Heading position for keyword search */
private long _keywordHeading = 0L;
/** Index bytes of the current group entry */
private byte[] _currentGroupEntryIndex = null;
/**
* Build searcher object.
*
* @param sub subbook.
* @param hook hook used to render headings and text.
* @param exporter exporter that receives each entry.
* @param style index style.
* @param type search type.
* @see EntryEnumerator#WORD
* @see EntryEnumerator#ENDWORD
* @see EntryEnumerator#EXACTWORD
* @see EntryEnumerator#KEYWORD
* @see EntryEnumerator#CROSS
* @see EntryEnumerator#MULTI
*/
protected EntryEnumerator(final SubBook sub, final Hook<String> hook, final IExporter exporter, final IndexStyle style, final int type) {
super();
_logger = LoggerFactory.getLogger(getClass());
_hook = hook;
_exporter = exporter;
_sub = sub;
_file = sub.getTextFile();
_style = style;
_type = type;
search();
}
// The original comparison logic from SingleWordSearcher is kept below, commented out.
// These stubs make every entry "match", so the whole index gets enumerated.
private int _comparePre(final byte[] key, final byte[] pattern) {
return 0;
}
private int _compareSingle(byte[] key, byte[] pattern) {
return 1;
}
private int _compareGroup(byte[] key, byte[] pattern) {
return 0;
}
private void search() {
try {
// The actual word does not matter, since the comparison methods above are stubbed out.
search("dummy search string".getBytes());
} catch (EBException ex) {}
}
private void export(byte[] indexBytes, Result result) {
try {
String indexValue = null;
try {
indexValue = new String(indexBytes, "x-JIS0208");
} catch (UnsupportedEncodingException ex) {}
// Skip entries whose index bytes do not decode cleanly (the decoder yields U+FFFD).
if (indexValue == null || indexValue.contains("\uFFFD")) {
return;
}
String heading = result.getHeading(this._hook).toString();
String description = result.getText(this._hook).toString();
this._exporter.export(indexValue, heading, description);
} catch (EBException ex) {}
}
/**
* Set a word to search.
*
* @param word a search word.
*/
private void _setWord(final byte[] word) {
int len = word.length;
_word = new byte[len];
System.arraycopy(word, 0, _word, 0, len);
_canonical = new byte[len];
System.arraycopy(word, 0, _canonical, 0, len);
if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
_style.fixWordLatin(_canonical);
} else {
_style.fixWord(_canonical);
}
if (_style.getIndexID() != 0x70 && _style.getIndexID() != 0x90) {
System.arraycopy(_canonical, 0, _word, 0, len);
}
// For suffix (backward) search, reverse the word
if (_type == ENDWORD) {
if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
ByteUtil.reverseWordLatin(_word);
ByteUtil.reverseWordLatin(_canonical);
} else {
ByteUtil.reverseWord(_word);
ByteUtil.reverseWord(_canonical);
}
}
try {
_logger.debug("search word: '" + new String(_word, "x-JIS0208") + "'");
_logger.debug("search canonical word: '" + new String(_canonical, "x-JIS0208") + "'");
} catch (UnsupportedEncodingException e) {
}
}
// /**
// * Compares the key with the pattern.
// *
// * @param key the key
// * @param pattern the pattern
// * @return 0 if the key equals the pattern,
// * 1 or more if the key is greater than the pattern,
// * -1 or less if the key is less than the pattern
// */
// private int _comparePre(final byte[] key, final byte[] pattern) {
// int comp = 0;
// switch (_type) {
// case EXACTWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, true);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, true);
// }
// break;
// case MULTI:
// if (_style.getCandidatePage() == 0) {
// comp = CompareUtil.compareToByte(key, pattern, true);
// } else {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, true);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, true);
// }
// }
// break;
// case WORD:
// case ENDWORD:
// case KEYWORD:
// case CROSS:
// default:
// comp = CompareUtil.compareToByte(key, pattern, true);
// break;
// }
// try {
// _logger.debug("compare key word: (" + comp + ") '"
// + new String(key, "x-JIS0208") + "' '"
// + new String(pattern, "x-JIS0208") + "'");
// } catch (UnsupportedEncodingException e) {
// }
// return comp;
// }
//
// /**
// * Compares the key with the pattern.
// *
// * @param key the key
// * @param pattern the pattern
// * @return 0 if the key equals the pattern,
// * 1 or more if the key is greater than the pattern,
// * -1 or less if the key is less than the pattern
// */
// private int _compareSingle(final byte[] key, final byte[] pattern) {
// int comp = 0;
// switch (_type) {
// case ENDWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// IndexStyle style = _sub.getEndwordIndexStyle(SubBook.KANA);
// if (style != null && _style.getStartPage() == style.getStartPage()) {
// comp = CompareUtil.compareToKanaSingle(key, pattern, false);
// } else {
// comp = CompareUtil.compareToByte(key, pattern, false);
// }
// }
// break;
// case EXACTWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// IndexStyle style = _sub.getWordIndexStyle(SubBook.KANA);
// if (style != null && _style.getStartPage() == style.getStartPage()) {
// comp = CompareUtil.compareToKanaSingle(key, pattern, true);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, false);
// }
// }
// break;
// case KEYWORD:
// case CROSS:
// comp = CompareUtil.compareToByte(key, pattern, false);
// break;
// case MULTI:
// if (_style.getCandidatePage() == 0) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, false);
// }
// }
// break;
// case WORD:
// default:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// IndexStyle style = _sub.getWordIndexStyle(SubBook.KANA);
// if (style != null && _style.getStartPage() == style.getStartPage()) {
// comp = CompareUtil.compareToKanaSingle(key, pattern, false);
// } else {
// comp = CompareUtil.compareToByte(key, pattern, false);
// }
// }
// break;
// }
// try {
// _logger.debug("compare key word: (" + comp + ") '"
// + new String(key, "x-JIS0208") + "' '"
// + new String(pattern, "x-JIS0208") + "'");
// } catch (UnsupportedEncodingException e) {
// }
// return comp;
// }
//
// /**
// * Compares the key with the pattern.
// *
// * @param key the key
// * @param pattern the pattern
// * @return 0 if the key equals the pattern,
// * 1 or more if the key is greater than the pattern,
// * -1 or less if the key is less than the pattern
// */
// private int _compareGroup(final byte[] key, final byte[] pattern) {
// int comp = 0;
// switch (_type) {
// case EXACTWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, true);
// }
// break;
// case MULTI:
// if (_style.getCandidatePage() == 0) {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, false);
// }
// } else {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, true);
// }
// }
// break;
// case WORD:
// case ENDWORD:
// case KEYWORD:
// case CROSS:
// default:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, false);
// }
// break;
// }
// try {
// _logger.debug("compare key word: (" + comp + ") '"
// + new String(key, "x-JIS0208") + "' '"
// + new String(pattern, "x-JIS0208") + "'");
// } catch (UnsupportedEncodingException e) {
// }
// return comp;
// }
/**
* Performs the search.
*
* @param word the search word
* @exception EBException if an error occurs during preprocessing
*/
protected void search(final byte[] word) throws EBException {
_setWord(word);
_page = _style.getStartPage();
// pre-search
BookInputStream bis = _file.getInputStream();
try {
long nextPage = _page;
int depth;
for (depth=0; depth<MAX_INDEX_DEPTH; depth++) {
// Read the data into the cache
bis.seek(_page, 0);
bis.readFully(_cache, 0, _cache.length);
_cachePage = _page;
_pageID = _cache[0] & 0xff;
_entryLength = _cache[1] & 0xff;
if (_entryLength == 0) {
_entryArrangement = VARIABLE;
} else {
_entryArrangement = FIXED;
}
_entryCount = ByteUtil.getInt2(_cache, 2);
_off = 4;
_logger.debug("page=0x" + HexUtil.toHexString(_page)
+ ", ID=0x" + HexUtil.toHexString(_pageID));
// Stop the loop when a leaf index page is reached
if (_isLeafLayer(_pageID)) {
break;
}
// Get the index of the next level
byte[] b = new byte[_entryLength];
for (_entryIndex=0; _entryIndex<_entryCount; _entryIndex++) {
if (_off + _entryLength + 4 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
System.arraycopy(_cache, _off, b, 0, b.length);
_off += _entryLength;
if (_comparePre(_canonical, b) <= 0) {
nextPage = ByteUtil.getLong4(_cache, _off);
break;
}
_off += 4;
}
if (_entryIndex >= _entryCount || nextPage == _page) {
_comparison = -1;
return;
}
_page = nextPage;
}
// Check the index depth
if (depth == MAX_INDEX_DEPTH) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
} finally {
bis.close();
}
_entryIndex = 0;
_comparison = 1;
_inGroupEntry = false;
}
/**
* Returns the next search result.
* In this enumerator every matching entry is emitted through the IExporter,
* and null is returned once the enumeration has finished.
*
* @return the search result (null if there is no next result)
* @exception EBException if an error occurs during the search
*/
@Override
public Result getNextResult() throws EBException {
if (_comparison < 0) {
return null;
}
while (true) {
refreshCache();
if (!_isLeafLayer(_pageID)) {
// Throw if this is not a leaf index page
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
if (!_hasGroupEntry(_pageID)) {
while (_entryIndex < _entryCount) {
parseNonGroupEntry();
}
} else {
while (_entryIndex < _entryCount) {
parseGroupedEntry();
}
}
// Continue if there is a next page; otherwise finish
if (_isLayerEnd(_pageID)) {
_comparison = -1;
break;
}
_page++;
_entryIndex = 0;
}
return null;
}
/**
* Determines whether the specified page is in the lowest (leaf) layer.
*
* @param id the page ID
* @return true if it is the lowest layer, false otherwise
*/
private boolean _isLeafLayer(final int id) {
if ((id & 0x80) == 0x80) {
return true;
}
return false;
}
// /**
// * Determines whether the specified page is the first page of a layer.
// *
// * @param id the page ID
// * @return true if it is the first page of a layer, false otherwise
// */
// private boolean _isLayerStart(int id) {
// if ((id & 0x40) == 0x40) {
// return true;
// }
// return false;
// }
/**
* Determines whether the specified page is the last page of a layer.
*
* @param id the page ID
* @return true if it is the last page of a layer, false otherwise
*/
private boolean _isLayerEnd(final int id) {
if ((id & 0x20) == 0x20) {
return true;
}
return false;
}
/**
* Determines whether the specified page contains group entries.
*
* @param id the page ID
* @return true if it contains group entries, false otherwise
*/
private boolean _hasGroupEntry(final int id) {
if ((id & 0x10) == 0x10) {
return true;
}
return false;
}
// Reload the cache if the cached page differs from the current data page
private void refreshCache() throws EBException {
if (_cachePage != _page) {
BookInputStream bis = _file.getInputStream();
try {
bis.seek(_page, 0);
bis.readFully(_cache, 0, _cache.length);
} finally {
bis.close();
}
_cachePage = _page;
if (_entryIndex == 0) {
_pageID = _cache[0] & 0xff;
_entryLength = _cache[1] & 0xff;
if (_entryLength == 0) {
_entryArrangement = VARIABLE;
} else {
_entryArrangement = FIXED;
}
_entryCount = ByteUtil.getInt2(_cache, 2);
_entryIndex = 0;
_off = 4;
_logger.info("page=0x" + HexUtil.toHexString(_page)
+ ", ID=0x" + HexUtil.toHexString(_pageID));
}
}
}
// Page without group entries
private void parseNonGroupEntry() throws EBException {
if (_entryArrangement == VARIABLE) {
if (_off + 1 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
_entryLength = _cache[_off] & 0xff;
_off++;
}
if (_off + _entryLength + 12 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
byte[] b = new byte[_entryLength];
System.arraycopy(_cache, _off, b, 0, b.length);
_off += _entryLength;
_comparison = _compareSingle(_word, b);
if (_comparison == 1) {
// Get the text/heading positions
long tPage = ByteUtil.getLong4(_cache, _off);
int tOff = ByteUtil.getInt2(_cache, _off+4);
long hPage = ByteUtil.getLong4(_cache, _off+6);
int hOff = ByteUtil.getInt2(_cache, _off+10);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(b, result);
}
_entryIndex++;
_off += 12;
}
// Page with group entries
private void parseGroupedEntry() throws EBException {
if (_off + 2 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
int groupID = _cache[_off] & 0xff;
if (groupID == 0x00) {
// Single entry
_entryLength = _cache[_off+1] & 0xff;
if (_off + _entryLength + 14 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
byte[] b = new byte[_entryLength];
System.arraycopy(_cache, _off+2, b, 0, b.length);
_off += _entryLength + 2;
_comparison = _compareSingle(_canonical, b);
if (_comparison == 1) {
// Get the text/heading positions
long tPage = ByteUtil.getLong4(_cache, _off);
int tOff = ByteUtil.getInt2(_cache, _off+4);
long hPage = ByteUtil.getLong4(_cache, _off+6);
int hOff = ByteUtil.getInt2(_cache, _off+10);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(b, result);
}
_off += 12;
_inGroupEntry = false;
} else if (groupID == 0x80) {
// Start of a group entry
_entryLength = _cache[_off+1] & 0xff;
byte[] b = new byte[_entryLength];
if (_type == KEYWORD || _type == CROSS) {
if (_off + _entryLength + 12 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
System.arraycopy(_cache, _off+6, b, 0, b.length);
_off += _entryLength + 6;
_comparison = _compareSingle(_word, b);
long hPage = ByteUtil.getLong4(_cache, _off);
int hOff = ByteUtil.getInt2(_cache, _off+4);
_keywordHeading =
BookInputStream.getPosition(hPage, hOff);
_off += 6;
} else if (_type == MULTI) {
if (_off + _entryLength + 6 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
System.arraycopy(_cache, _off+6, b, 0, b.length);
_comparison = _compareSingle(_word, b);
_off += _entryLength + 6;
} else {
if (_off + _entryLength + 4 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
System.arraycopy(_cache, _off+4, b, 0, b.length);
_comparison = _compareSingle(_canonical, b);
_off += _entryLength + 4;
}
_currentGroupEntryIndex = b.clone();
_inGroupEntry = true;
} else if (groupID == 0xc0) {
// Element of a group entry
if (_type == KEYWORD || _type == CROSS) {
if (_off + 7 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
if (_comparison == 1 && _inGroupEntry) {
// Get the text/heading positions
long tPage = ByteUtil.getLong4(_cache, _off+1);
int tOff = ByteUtil.getInt2(_cache, _off+5);
Result result = new Result(_sub, _keywordHeading, tPage, tOff);
_keywordHeading =
_sub.getNextHeadingPosition(_keywordHeading);
export(_currentGroupEntryIndex, result);
}
_off += 7;
} else if (_type == MULTI) {
if (_off + 13 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
if (_comparison == 1 && _inGroupEntry) {
// Get the text/heading positions
long tPage = ByteUtil.getLong4(_cache, _off+1);
int tOff = ByteUtil.getInt2(_cache, _off+5);
long hPage = ByteUtil.getLong4(_cache, _off+7);
int hOff = ByteUtil.getInt2(_cache, _off+11);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(_currentGroupEntryIndex, result);
}
_off += 13;
} else {
_entryLength = _cache[_off+1] & 0xff;
if (_off + _entryLength + 14 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
byte[] b = new byte[_entryLength];
System.arraycopy(_cache, _off+2, b, 0, b.length);
_off += _entryLength + 2;
if (_comparison == 1 && _inGroupEntry
&& _compareGroup(_word, b) == 0) {
// Get the text/heading positions
long tPage = ByteUtil.getLong4(_cache, _off);
int tOff = ByteUtil.getInt2(_cache, _off+4);
long hPage = ByteUtil.getLong4(_cache, _off+6);
int hOff = ByteUtil.getInt2(_cache, _off+10);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(b, result);
}
_off += 12;
}
} else {
// Unknown ID
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
_entryIndex++;
}
}
// end of SingleWordSearcher.java
package com.github.rubyu.ebquery;
public interface IExporter {
public void export(String indexValue, String heading, String description);
}
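The gist is titled "[WIP] eb2json" and the test below is named JSONExporterTest, but only a TSVExporter is included. As a hypothetical sketch (not part of the gist; class name and output layout are assumptions), a line-delimited JSON implementation of IExporter could look roughly like this, with hand-rolled escaping to avoid pulling in a JSON library:

package com.github.rubyu.ebquery

import java.io.Writer

// Hypothetical sketch only: a line-delimited JSON exporter for IExporter.
// The gist itself ships only TSVExporter.
class JSONExporter(w: Writer) extends IExporter {

  // Minimal JSON string escaping: quotes, backslashes, and common control characters.
  private def escape(s: String): String =
    s.flatMap {
      case '"'          => "\\\""
      case '\\'         => "\\\\"
      case '\n'         => "\\n"
      case '\r'         => "\\r"
      case '\t'         => "\\t"
      case c if c < ' ' => ""      // drop any other control characters
      case c            => c.toString
    }

  // Write one JSON object per line: {"index": ..., "heading": ..., "description": ...}
  override def export(indexValue: String, heading: String, description: String): Unit = {
    val json = "{\"index\":\"" + escape(indexValue) +
      "\",\"heading\":\"" + escape(heading) +
      "\",\"description\":\"" + escape(description) + "\"}"
    w.write(json)
    w.write("\n")
  }

  def close(): Unit = {
    w.flush()
    w.close()
  }
}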
package com.github.rubyu.ebquery
import java.io.{File, PrintWriter}
import io.github.eb4j.{Book, EntryEnumerator, SingleWordEnumerator}
import io.github.eb4j.io.{BookReader, EPWINGInputStream}
import java.lang.reflect.Method
import org.specs2.mutable.Specification
class JSONExporterTest extends Specification {
"EBLeaf" should {
"equal with a instance be of the same string" in {
val dir = "/Users/i.y.nishiseki/Dropbox/work/dictionary/SRD"
var mapFilePath = "/Users/i.y.nishiseki/Dropbox/work/dictionary/SRD/SRD.map"
val subBook = new Book(dir).getSubBook(0)
val mapper = new ExternalCharacterMapper(mapFilePath)
val proc = new EBProcessor
proc.newline = new EBProcessorImpl.text.Newline
proc.externalCharacter = new EBProcessorImpl.text.ReplacementCharacter
proc.text = new EBProcessorImpl.text.Text
val hook = new EBProcessorAdapter(subBook, mapper, proc)
val exporter = new TSVExporter(new PrintWriter((new File("SRD.dump.tsv"))))
val enumerator = EntryEnumerator.Create(subBook, hook, exporter)
// println (Iterator.continually(searcher.getNextResult()) takeWhile(_ != null) size)
// 1118269
// Iterator.continually(enumerator.getNextResult()) takeWhile(_ != null) foreach { result =>
// val h = result.getHeading(hook)
// val t = result.getText(hook)
// if (h.startsWith("get")) {
// println(s"{heading=$h, text=$t".replace("\n", "¥n"))
// }
// }
enumerator.getNextResult();
// {heading=ab·so·lute, text=ABSOLUTE ADDRESS.¥n━n.⇑¥n【1】絶対的なもの(↔relative).¥n
// {heading=act, text=ACT OF TOLERATION [UNIFORMITY].¥n(2)⦅しばしば A-⦆(会議の)記録,
// {heading=ac·tor, text=a BAD ACTOR.¥n【4】〘ローマ法〙原告(plaintiff), 弁護人(advocat
// {heading=age, text=STONE AGE¥nBRONZE AGE¥nIRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe
// {heading=age, text=BRONZE AGE¥nIRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe age of elec
// {heading=age, text=IRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe age of electronic commu
true mustEqual true
}
}
}
package com.github.rubyu.ebquery
import java.io.Writer
import com.github.tototoshi.csv._
class TSVExporter(w: Writer) extends IExporter {
implicit val format = new TSVFormat {}
val writer = CSVWriter.open(w)(format)
private def escape(s: String): String =
s.replaceAll(raw"\\", raw"\\\\").replaceAll(raw"\n", raw"\\n")
override def export(indexValue: String, heading: String, description: String): Unit =
writer.writeRow(List(escape(indexValue), escape(heading), escape(description)))
def close(): Unit = writer.close()
}
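For reference, a minimal standalone use of TSVExporter might look like the sketch below (the file name and rows are made up):

package com.github.rubyu.ebquery

import java.io.{File, PrintWriter}

// Sketch: exporting a couple of rows by hand and closing the writer.
object TSVExporterDemo {
  def main(args: Array[String]): Unit = {
    val exporter = new TSVExporter(new PrintWriter(new File("demo.tsv")))
    exporter.export("1", "heading", "first line\nsecond line") // newline becomes a literal \n in the TSV
    exporter.export("2", "other", "text with a \\ backslash")  // backslash is doubled
    exporter.close()
  }
}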
rubyu commented Jul 4, 2019

1	h1	d1
2	h1	d2
3	h2	d3
4	h3	d3

csvq -no-header "SELECT c2, LISTAGG(c3, '-') FROM test.tsv GROUP BY c2"
OK

csvq -no-header "SELECT lower_c2, LISTAGG(c3, '-') FROM (SELECT LOWER(c2) AS lower_c2, c3 FROM test.tsv) GROUP BY lower_c2"
OK

csvq -no-header "SELECT lower_c2, LISTAGG(c3, '-') FROM (SELECT LOWER(c2) AS lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM test.tsv)) GROUP BY lower_c2"
OK

Huh...? These all work.

rubyu commented Jul 4, 2019

grep -E "\tA" SRD.dump.tsv > SRD.A.tsv

csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

Reproduced it.

rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '-') FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '-') FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2 ORDER BY lower_c2 ASC"

OK!!!!
It was a misuse of GROUP BY... grouping by both lower_c2 and c3 left only one c3 value per group, so LISTAGG never concatenated anything; grouping by lower_c2 alone works.

rubyu commented Jul 4, 2019

Headwords that end in a digit, like dump / dump1 / get / get1 / get2,
are not being picked up properly.

rubyu commented Jul 4, 2019

1	h1	d1
2	h1	d2
3	h2	d3
4	h3	d3

csvq -no-header "
SELECT
c2,
SUBSTR(c2, 0,
CASE SUBSTR(c2, LEN(c2) - 1, 1)
WHEN '1' THEN LEN(c2) - 1
WHEN '2' THEN LEN(c2) - 1
WHEN '3' THEN LEN(c2) - 1
WHEN '4' THEN LEN(c2) - 1
WHEN '5' THEN LEN(c2) - 1
WHEN '6' THEN LEN(c2) - 1
WHEN '7' THEN LEN(c2) - 1
WHEN '8' THEN LEN(c2) - 1
WHEN '9' THEN LEN(c2) - 1
ELSE LEN(c2)
END) AS tail_trimmed
FROM test.tsv
GROUP BY c2"

+----+--------------+
| c2 | tail_trimmed |
+----+--------------+
| h1 | h            |
| h2 | h            |
| h3 | h            |
+----+--------------+

Is csvq's SUBSTR 0-origin...?

1	ha1	d1
2	haa1	d2
3	haaa2	d3
4	haaaa3	d3

csvq -no-header "
SELECT
c2,
SUBSTR(c2, 0,
CASE SUBSTR(c2, LEN(c2) - 1, 1)
WHEN '1' THEN LEN(c2) - 1
WHEN '2' THEN LEN(c2) - 1
WHEN '3' THEN LEN(c2) - 1
WHEN '4' THEN LEN(c2) - 1
WHEN '5' THEN LEN(c2) - 1
WHEN '6' THEN LEN(c2) - 1
WHEN '7' THEN LEN(c2) - 1
WHEN '8' THEN LEN(c2) - 1
WHEN '9' THEN LEN(c2) - 1
ELSE LEN(c2)
END) AS tail_trimmed
FROM test.tsv
GROUP BY c2"
+--------+--------------+
| c2     | tail_trimmed |
+--------+--------------+
| ha1    | ha           |
| haa1   | haa          |
| haaa2  | haaa         |
| haaaa3 | haaaa        |
+--------+--------------+
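
For reference outside csvq — a hypothetical Scala helper, not part of the gist — the same trimming of a single trailing digit is just:

// Hypothetical helper, mirroring the CASE/SUBSTR expression above:
// drop one trailing digit from a headword ("get1" -> "get", "dump" -> "dump").
def trimTrailingDigit(headword: String): String =
  headword.replaceAll("[0-9]$", "")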

rubyu commented Jul 4, 2019

csvq "SELECT INSTR('foo@example.com', '@');"

+-------------------------------+
| INSTR('foo@example.com', '@') |
+-------------------------------+
|                             3 |
+-------------------------------+

rubyu commented Jul 5, 2019

Do it in BigQuery.

Put the .gz on Google Drive and configure it in BigQuery as an external table.

INSERT INTO `test.srd`
SELECT * FROM `test.test`

rubyu commented Jul 5, 2019

Filtering to just the entries whose index starts with 0x00 gives more rows than filtering on the ones containing 0xFFFD did. That's contrary to what I expected.
