Skip to content

Instantly share code, notes, and snippets.

@rubyu
Last active July 5, 2019 09:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rubyu/cb04e8457993a5e53cc6c1bc1d8d4796 to your computer and use it in GitHub Desktop.
Save rubyu/cb04e8457993a5e53cc6c1bc1d8d4796 to your computer and use it in GitHub Desktop.
[WIP] eb2json
import sbt.Keys._
import sbt._
import sbtassembly.AssemblyPlugin.autoImport._
// sbt build definition (legacy sbt.Build style) for the "ebquery" project.
// It declares the project coordinates, the sbt-assembly jar settings, the
// compiler flags and the library dependencies.
object Build extends sbt.Build {
// Settings shared by the build: sbt's core defaults plus version, Scala version,
// organization and project name.
lazy val commonSettings =
Defaults.coreDefaultSettings ++
Seq(
version := "0.3.2",
scalaVersion := "2.11.8",
organization := "com.github.rubyu",
name := "ebquery"
)
// The single project of this build, rooted at the build's base directory.
lazy val project =
Project("ebquery", file("."))
.settings(commonSettings: _*)
// sbt-assembly: entry point of the assembled jar and its file name "<name>-<version>.jar".
.settings(Seq(
mainClass in assembly := Some("com.github.rubyu.ebquery.Main"),
assemblyJarName in assembly := name.value + "-" + version.value + ".jar"
))
// Compiler flags; note that ":=" replaces any previously configured scalacOptions.
.settings(Seq(
scalacOptions := Seq(
"-deprecation",
"-unchecked",
"-feature"
)
))
// Library dependencies; entries marked "test" are only on the test classpath.
.settings(
libraryDependencies ++= Seq(
"com.github.tototoshi" %% "scala-csv" % "1.3.6",
// The Scala binary suffix is spelled out by hand here ("_2.11"); it has to stay in
// step with scalaVersion above.
"org.scala-lang.modules" % "scala-xml_2.11" % "1.0.4",
"org.slf4j" % "slf4j-api" % "1.7.21",
"org.slf4j" % "slf4j-simple" % "1.7.21",
"args4j" % "args4j" % "2.0.26",
"commons-codec" % "commons-codec" % "1.9",
"commons-lang" % "commons-lang" % "2.4",
"org.specs2" %% "specs2-core" % "3.7.2" % "test",
"junit" % "junit" % "4.7" % "test",
"com.rexsl" % "rexsl-w3c" % "0.13" % "test",
"com.rexsl" % "rexsl-test" % "0.4.12" % "test",
"javax.json" % "javax.json-api" % "1.0" % "test",
// halt warning messages for multiple dependencies
"org.scala-lang" % "scala-reflect" % "2.11.8" % "test",
"org.scala-lang" % "scala-compiler" % "2.11.8" % "test",
// halt warning messages for circular dependencies
"com.jcabi" % "jcabi-log" % "0.12.1" % "test"
)
)
}
package io.github.eb4j;
import java.io.UnsupportedEncodingException;
import io.github.eb4j.hook.Hook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import io.github.eb4j.io.EBFile;
import io.github.eb4j.io.BookInputStream;
import io.github.eb4j.util.ByteUtil;
import io.github.eb4j.util.HexUtil;
import com.github.rubyu.ebquery.IExporter;
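/**
* Enumerates the entries of a sub-book's word index and hands each one to an
* {@link IExporter}, instead of searching for a single word.
*
* Adapted from eb4j's SingleWordSearcher: the key comparison methods are stubbed
* out so that every index entry is treated as a match.
*
* @author Hisaya FUKUMOTO (original SingleWordSearcher)
*
* Copied from: https://github.com/eb4j/eb4j/blob/5c1dd0a8aa6eca5ae7489787456333d7eef5fa2a/eb4j-core/src/main/java/io/github/eb4j/SingleWordSearcher.java
*/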
public class EntryEnumerator implements Searcher {
/** Hook used to render an entry's heading and body text; assigned once by the constructor. */
private final Hook<String> _hook;
/** Receiver of every enumerated entry; assigned once by the constructor. */
private final com.github.rubyu.ebquery.IExporter _exporter;
/**
 * Creates an enumerator over one of the word indexes of the given sub-book.
 *
 * The index is chosen in this order of preference: ALPHABET (2), KANJI (1),
 * KANA (0). The first one the sub-book provides is used.
 *
 * @param sub sub-book whose word index is enumerated.
 * @param hook hook used to render headings and body text.
 * @param exporter receiver of every enumerated entry.
 * @return a new enumerator, already positioned for {@link #getNextResult()}.
 * @throws IllegalStateException if the sub-book provides none of the word indexes.
 */
public static EntryEnumerator Create(SubBook sub, Hook<String> hook, IExporter exporter) {
    // Word index types as passed to getWordIndexStyle: 0 = KANA, 1 = KANJI, 2 = ALPHABET.
    final int[] preferredTypes = {2, 1, 0};
    IndexStyle wordStyle = null;
    for (int type : preferredTypes) {
        wordStyle = sub.getWordIndexStyle(type);
        if (wordStyle != null) {
            break;
        }
    }
    if (wordStyle == null) {
        // Without this check a null style would surface later as a bare NullPointerException.
        throw new IllegalStateException("sub-book has no word index to enumerate");
    }
    return new EntryEnumerator(sub, hook, exporter, wordStyle, EXACTWORD);
}
/** Constant denoting a prefix (forward-match) search. */
protected static final int WORD = 0;
/** Constant denoting a suffix (backward-match) search. */
protected static final int ENDWORD = 1;
/** Constant denoting an exact-match search. */
protected static final int EXACTWORD = 2;
/** Constant denoting a keyword (conditional) search. */
protected static final int KEYWORD = 3;
/** Constant denoting a cross search. */
protected static final int CROSS = 4;
/** Constant denoting a multi (compound) search. */
protected static final int MULTI = 5;
/** Maximum depth of the index tree. */
private static final int MAX_INDEX_DEPTH = 6;
/** Entry arrangement style: each entry carries its own length. */
private static final int VARIABLE = 0;
/** Entry arrangement style: all entries on a page share one length. */
private static final int FIXED = 1;
/** Logger. */
private Logger _logger = null;
/** Sub-book being read. */
private SubBook _sub = null;
/** Index style. */
private IndexStyle _style = null;
/** Current search type (one of the constants above). */
private int _type = 0;
/** Search word. */
private byte[] _word = null;
/** Search key (canonicalized form of the search word). */
private byte[] _canonical = null;
/** File to search. */
private EBFile _file = null;
/** Cache holding one page of the file. */
private byte[] _cache = new byte[BookInputStream.PAGE_SIZE];
/** Page position of the cached page. */
private long _cachePage = 0L;
/** Current offset within the cache. */
private int _off = 0;
/** Page position of the data being read. */
private long _page = 0L;
/** Page ID of the data page. */
private int _pageID = 0;
/** Size of an entry. */
private int _entryLength = 0;
/** Entry arrangement (VARIABLE or FIXED). */
private int _entryArrangement = 0;
/** Number of entries on the page. */
private int _entryCount = 0;
/** Index of the current entry. */
private int _entryIndex = 0;
/** Flag indicating that parsing is currently inside a group entry. */
private boolean _inGroupEntry = false;
/** Comparison result; a negative value makes getNextResult() return immediately. */
private int _comparison = -1;
/** Heading position used for keyword search. */
private long _keywordHeading = 0L;
/** Copy of the key bytes of the most recently parsed group-start entry. */
private byte[] _currentGroupEntryIndex = null;
/**
 * Builds the enumerator and immediately runs the pre-search that positions it
 * on a leaf page of the index.
 *
 * @param sub subbook.
 * @param hook hook used to render headings and body text.
 * @param exporter receiver of every enumerated entry.
 * @param style index style.
 * @param type search type.
 * @see EntryEnumerator#WORD
 * @see EntryEnumerator#ENDWORD
 * @see EntryEnumerator#EXACTWORD
 * @see EntryEnumerator#KEYWORD
 * @see EntryEnumerator#CROSS
 * @see EntryEnumerator#MULTI
 */
protected EntryEnumerator(final SubBook sub, final Hook<String> hook, final IExporter exporter, final IndexStyle style, final int type) {
    this._logger = LoggerFactory.getLogger(getClass());
    // Collaborators supplied by the caller.
    this._exporter = exporter;
    this._hook = hook;
    // Where and how to read the index.
    this._sub = sub;
    this._file = sub.getTextFile();
    this._type = type;
    this._style = style;
    // Must run last: the pre-search reads the fields assigned above.
    search();
}
/**
 * Stub replacing the key comparison used while descending the index tree.
 *
 * Always reports "equal" (0), so the descent in search(byte[]) follows the
 * first entry of every intermediate index page. Both arguments are ignored.
 *
 * @param key search key (ignored).
 * @param pattern key read from the index page (ignored).
 * @return always 0.
 */
private int _comparePre(final byte[] key, final byte[] pattern) {
return 0;
}
/**
 * Stub replacing the key comparison used on leaf pages.
 *
 * Always returns 1, which is exactly the value the leaf parsers test for
 * ({@code _comparison == 1}) before exporting an entry, so every entry is
 * exported. Both arguments are ignored.
 *
 * @param key search word or key (ignored).
 * @param pattern key read from the leaf entry (ignored).
 * @return always 1.
 */
private int _compareSingle(byte[] key, byte[] pattern) {
return 1;
}
/**
 * Stub replacing the key comparison used for elements of a group entry.
 *
 * Always reports "equal" (0), so no group element is filtered out by its key.
 * Both arguments are ignored.
 *
 * @param key search word (ignored).
 * @param pattern key read from the group element (ignored).
 * @return always 0.
 */
private int _compareGroup(byte[] key, byte[] pattern) {
return 0;
}
/**
 * Runs the pre-search with a placeholder word so that the enumerator ends up
 * positioned on a leaf page of the index.
 *
 * The word itself does not influence the result because the comparison
 * methods of this class ignore their arguments.
 */
private void search() {
    try {
        search("dummy search string".getBytes());
    } catch (EBException ex) {
        // Best effort: the enumerator stays usable, but report the failure instead of
        // hiding it, since it usually means nothing (or not everything) will be exported.
        _logger.warn("pre-search failed; the enumeration may yield no entries", ex);
    }
}
/**
 * Decodes the index key of an entry and passes the entry to the exporter.
 *
 * Entries whose key cannot be decoded cleanly (the decoded text contains the
 * Unicode replacement character U+FFFD) are skipped. Failures while reading
 * the heading or the body text are logged and the entry is skipped; they do
 * not abort the enumeration.
 *
 * @param indexBytes raw key bytes of the index entry, decoded as x-JIS0208.
 * @param result location of the entry's heading and body text.
 */
private void export(byte[] indexBytes, Result result) {
    final String indexValue;
    try {
        indexValue = new String(indexBytes, "x-JIS0208");
    } catch (UnsupportedEncodingException ex) {
        // The JVM lacks the charset; no key can be decoded, so say so rather than drop entries silently.
        _logger.warn("charset x-JIS0208 is not available; entry skipped", ex);
        return;
    }
    if (indexValue.contains("\uFFFD")) {
        // The key is not valid JIS X 0208 text; skip it as before.
        return;
    }
    try {
        String heading = result.getHeading(this._hook).toString();
        String description = result.getText(this._hook).toString();
        this._exporter.export(indexValue, heading, description);
    } catch (EBException ex) {
        _logger.warn("failed to read entry '" + indexValue + "'; entry skipped", ex);
    }
}
/**
 * Stores the search word together with its canonical (index-normalized) form.
 *
 * @param word a search word.
 */
private void _setWord(final byte[] word) {
    final boolean latin = _sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1;

    // Work on private copies so the caller's array is never modified.
    _word = word.clone();
    _canonical = word.clone();

    if (latin) {
        _style.fixWordLatin(_canonical);
    } else {
        _style.fixWord(_canonical);
    }

    // Indexes 0x70 and 0x90 keep the word as given; every other index uses the canonical form.
    final int indexId = _style.getIndexID();
    final boolean keepsOriginalWord = (indexId == 0x70 || indexId == 0x90);
    if (!keepsOriginalWord) {
        System.arraycopy(_canonical, 0, _word, 0, _word.length);
    }

    // A suffix search matches against reversed keys, so reverse both forms.
    if (_type == ENDWORD) {
        if (latin) {
            ByteUtil.reverseWordLatin(_word);
            ByteUtil.reverseWordLatin(_canonical);
        } else {
            ByteUtil.reverseWord(_word);
            ByteUtil.reverseWord(_canonical);
        }
    }

    try {
        _logger.debug("search word: '" + new String(_word, "x-JIS0208") + "'");
        _logger.debug("search canonical word: '" + new String(_canonical, "x-JIS0208") + "'");
    } catch (UnsupportedEncodingException ignored) {
        // Only the debug output is affected.
    }
}
// /**
// * キーとパターンを比較します。
// *
// * @param key キー
// * @param pattern パターン
// * @return キーがパターンと同じ場合:0、
// * キーがパターンより大きい場合:1以上、
// * キーがパターンより小さい場合:-1以下
// */
// private int _comparePre(final byte[] key, final byte[] pattern) {
// int comp = 0;
// switch (_type) {
// case EXACTWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, true);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, true);
// }
// break;
// case MULTI:
// if (_style.getCandidatePage() == 0) {
// comp = CompareUtil.compareToByte(key, pattern, true);
// } else {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, true);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, true);
// }
// }
// break;
// case WORD:
// case ENDWORD:
// case KEYWORD:
// case CROSS:
// default:
// comp = CompareUtil.compareToByte(key, pattern, true);
// break;
// }
// try {
// _logger.debug("compare key word: (" + comp + ") '"
// + new String(key, "x-JIS0208") + "' '"
// + new String(pattern, "x-JIS0208") + "'");
// } catch (UnsupportedEncodingException e) {
// }
// return comp;
// }
//
// /**
// * キーとパターンを比較します。
// *
// * @param key キー
// * @param pattern パターン
// * @return キーがパターンと同じ場合:0、
// * キーがパターンより大きい場合:1以上、
// * キーがパターンより小さい場合:-1以下
// */
// private int _compareSingle(final byte[] key, final byte[] pattern) {
// int comp = 0;
// switch (_type) {
// case ENDWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// IndexStyle style = _sub.getEndwordIndexStyle(SubBook.KANA);
// if (style != null && _style.getStartPage() == style.getStartPage()) {
// comp = CompareUtil.compareToKanaSingle(key, pattern, false);
// } else {
// comp = CompareUtil.compareToByte(key, pattern, false);
// }
// }
// break;
// case EXACTWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// IndexStyle style = _sub.getWordIndexStyle(SubBook.KANA);
// if (style != null && _style.getStartPage() == style.getStartPage()) {
// comp = CompareUtil.compareToKanaSingle(key, pattern, true);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, false);
// }
// }
// break;
// case KEYWORD:
// case CROSS:
// comp = CompareUtil.compareToByte(key, pattern, false);
// break;
// case MULTI:
// if (_style.getCandidatePage() == 0) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, false);
// }
// }
// break;
// case WORD:
// default:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// IndexStyle style = _sub.getWordIndexStyle(SubBook.KANA);
// if (style != null && _style.getStartPage() == style.getStartPage()) {
// comp = CompareUtil.compareToKanaSingle(key, pattern, false);
// } else {
// comp = CompareUtil.compareToByte(key, pattern, false);
// }
// }
// break;
// }
// try {
// _logger.debug("compare key word: (" + comp + ") '"
// + new String(key, "x-JIS0208") + "' '"
// + new String(pattern, "x-JIS0208") + "'");
// } catch (UnsupportedEncodingException e) {
// }
// return comp;
// }
//
// /**
// * キーとパターンを比較します。
// *
// * @param key キー
// * @param pattern パターン
// * @return キーがパターンと同じ場合:0、
// * キーがパターンより大きい場合:1以上、
// * キーがパターンより小さい場合:-1以下
// */
// private int _compareGroup(final byte[] key, final byte[] pattern) {
// int comp = 0;
// switch (_type) {
// case EXACTWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, true);
// }
// break;
// case MULTI:
// if (_style.getCandidatePage() == 0) {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, false);
// }
// } else {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, true);
// }
// }
// break;
// case WORD:
// case ENDWORD:
// case KEYWORD:
// case CROSS:
// default:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, false);
// }
// break;
// }
// try {
// _logger.debug("compare key word: (" + comp + ") '"
// + new String(key, "x-JIS0208") + "' '"
// + new String(pattern, "x-JIS0208") + "'");
// } catch (UnsupportedEncodingException e) {
// }
// return comp;
// }
/**
* Performs the pre-search: starting at the start page of the index style, walks
* down the index tree until a leaf page is loaded into the cache.
*
* Because _comparePre always returns 0 in this class, the first entry of every
* intermediate page is followed. On success the state is reset so that
* getNextResult() starts at the first entry of the leaf page; if no next page
* can be determined, _comparison is set to -1 and getNextResult() returns at once.
*
* @param word search word
* @exception EBException if an error occurs during the pre-processing
*/
protected void search(final byte[] word) throws EBException {
_setWord(word);
_page = _style.getStartPage();
// pre-search
BookInputStream bis = _file.getInputStream();
try {
long nextPage = _page;
int depth;
for (depth=0; depth<MAX_INDEX_DEPTH; depth++) {
// load the page into the cache
bis.seek(_page, 0);
bis.readFully(_cache, 0, _cache.length);
_cachePage = _page;
// page header: byte 0 = page ID, byte 1 = entry length (0 means variable), bytes 2-3 = entry count
_pageID = _cache[0] & 0xff;
_entryLength = _cache[1] & 0xff;
if (_entryLength == 0) {
_entryArrangement = VARIABLE;
} else {
_entryArrangement = FIXED;
}
_entryCount = ByteUtil.getInt2(_cache, 2);
_off = 4;
_logger.debug("page=0x" + HexUtil.toHexString(_page)
+ ", ID=0x" + HexUtil.toHexString(_pageID));
// stop once a leaf index page has been reached
if (_isLeafLayer(_pageID)) {
break;
}
// pick the index page of the next level
byte[] b = new byte[_entryLength];
for (_entryIndex=0; _entryIndex<_entryCount; _entryIndex++) {
// each entry is a key of _entryLength bytes followed by a 4-byte page number
if (_off + _entryLength + 4 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
System.arraycopy(_cache, _off, b, 0, b.length);
_off += _entryLength;
if (_comparePre(_canonical, b) <= 0) {
nextPage = ByteUtil.getLong4(_cache, _off);
break;
}
_off += 4;
}
// no entry selected, or the selected entry points back at this page: nothing to enumerate
if (_entryIndex >= _entryCount || nextPage == _page) {
_comparison = -1;
return;
}
_page = nextPage;
}
// index depth check: the loop ran out without reaching a leaf page
if (depth == MAX_INDEX_DEPTH) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
} finally {
bis.close();
}
// ready to enumerate from the first entry of the leaf page
_entryIndex = 0;
_comparison = 1;
_inGroupEntry = false;
}
/**
 * Enumerates all remaining index entries in one call.
 *
 * Starting at the current leaf page, every entry of every page up to and
 * including the page flagged as the end of the layer is parsed; matching
 * entries are handed to the exporter as a side effect. Unlike a regular
 * searcher, no result object is ever returned.
 *
 * @return always null.
 * @exception EBException if a page that is not a leaf index page is encountered
 *            or the index data is malformed.
 */
@Override
public Result getNextResult() throws EBException {
    if (_comparison < 0) {
        return null;
    }
    boolean reachedLayerEnd;
    do {
        refreshCache();
        // Only leaf index pages may be enumerated.
        if (!_isLeafLayer(_pageID)) {
            throw new EBException(EBException.UNEXP_FILE, _file.getPath());
        }
        final boolean grouped = _hasGroupEntry(_pageID);
        while (_entryIndex < _entryCount) {
            if (grouped) {
                parseGroupedEntry();
            } else {
                parseNonGroupEntry();
            }
        }
        // Continue with the following page unless this one closes the layer.
        reachedLayerEnd = _isLayerEnd(_pageID);
        if (reachedLayerEnd) {
            _comparison = -1;
        } else {
            _page++;
            _entryIndex = 0;
        }
    } while (!reachedLayerEnd);
    return null;
}
/**
 * Tells whether the given page belongs to the lowest (leaf) layer of the index.
 *
 * @param id page ID
 * @return true if bit 0x80 of the page ID is set, false otherwise
 */
private boolean _isLeafLayer(final int id) {
    return (id & 0x80) != 0;
}
// /**
// * 指定されたページが階層開始ページかどうかを判別します。
// *
// * @param id ページID
// * @return 階層開始ページである場合はtrue、そうでない場合はfalse
// */
// private boolean _isLayerStart(int id) {
// if ((id & 0x40) == 0x40) {
// return true;
// }
// return false;
// }
/**
 * Tells whether the given page is the last page of its layer.
 *
 * @param id page ID
 * @return true if bit 0x20 of the page ID is set, false otherwise
 */
private boolean _isLayerEnd(final int id) {
    return (id & 0x20) != 0;
}
/**
 * Tells whether the given page contains group entries.
 *
 * @param id page ID
 * @return true if bit 0x10 of the page ID is set, false otherwise
 */
private boolean _hasGroupEntry(final int id) {
    return (id & 0x10) != 0;
}
/**
 * Loads the current data page into the cache if a different page is cached.
 *
 * The page header is parsed only when parsing is at the first entry of the
 * page; otherwise the existing parse position is kept.
 *
 * @exception EBException if the page cannot be read.
 */
private void refreshCache() throws EBException {
    if (_cachePage == _page) {
        return; // the wanted page is already cached
    }
    final BookInputStream in = _file.getInputStream();
    try {
        in.seek(_page, 0);
        in.readFully(_cache, 0, _cache.length);
    } finally {
        in.close();
    }
    _cachePage = _page;
    if (_entryIndex != 0) {
        return; // in the middle of a page: keep the current header values and offset
    }
    // Page header: byte 0 = page ID, byte 1 = entry length (0 means variable), bytes 2-3 = entry count.
    _pageID = _cache[0] & 0xff;
    _entryLength = _cache[1] & 0xff;
    _entryArrangement = (_entryLength == 0) ? VARIABLE : FIXED;
    _entryCount = ByteUtil.getInt2(_cache, 2);
    _off = 4;
    _logger.info("page=0x" + HexUtil.toHexString(_page)
        + ", ID=0x" + HexUtil.toHexString(_pageID));
}
/**
 * Parses one entry of a leaf page that has no group entries and exports it
 * when the comparison reports a match.
 *
 * Layout: an optional length byte (variable arrangement only), the key, then
 * a 12-byte location record (text page, text offset, heading page, heading offset).
 *
 * @exception EBException if the entry would extend beyond the page.
 */
private void parseNonGroupEntry() throws EBException {
    final int pageSize = BookInputStream.PAGE_SIZE;
    if (_entryArrangement == VARIABLE) {
        // Variable arrangement: every entry starts with its own key length.
        if (_off >= pageSize) {
            throw new EBException(EBException.UNEXP_FILE, _file.getPath());
        }
        _entryLength = _cache[_off] & 0xff;
        _off += 1;
    }
    if (_off + _entryLength + 12 > pageSize) {
        throw new EBException(EBException.UNEXP_FILE, _file.getPath());
    }
    final byte[] key = new byte[_entryLength];
    System.arraycopy(_cache, _off, key, 0, key.length);
    _off += _entryLength;
    _comparison = _compareSingle(_word, key);
    if (_comparison == 1) {
        // Read the locations of the body text and the heading.
        final long textPage = ByteUtil.getLong4(_cache, _off);
        final int textOffset = ByteUtil.getInt2(_cache, _off + 4);
        final long headingPage = ByteUtil.getLong4(_cache, _off + 6);
        final int headingOffset = ByteUtil.getInt2(_cache, _off + 10);
        export(key, new Result(_sub, headingPage, headingOffset, textPage, textOffset));
    }
    _entryIndex++;
    _off += 12;
}
/**
 * Parses one entry of a leaf page that contains group entries.
 *
 * The first byte of the entry is its group ID: 0x00 is a single entry, 0x80
 * starts a group and 0xc0 is an element of the current group. The layout of
 * group starts and group elements depends on the search type. Matching
 * entries are passed to export(); for KEYWORD, CROSS and MULTI group elements
 * the key of the enclosing group start is exported as the index value.
 *
 * @exception EBException if the entry would extend beyond the page or the
 *            group ID is unknown.
 */
private void parseGroupedEntry() throws EBException {
if (_off + 2 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
int groupID = _cache[_off] & 0xff;
if (groupID == 0x00) {
// single entry: ID byte, length byte, key, then a 12-byte location record
_entryLength = _cache[_off+1] & 0xff;
if (_off + _entryLength + 14 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
byte[] b = new byte[_entryLength];
System.arraycopy(_cache, _off+2, b, 0, b.length);
_off += _entryLength + 2;
_comparison = _compareSingle(_canonical, b);
if (_comparison == 1) {
// read the locations of the body text and the heading
long tPage = ByteUtil.getLong4(_cache, _off);
int tOff = ByteUtil.getInt2(_cache, _off+4);
long hPage = ByteUtil.getLong4(_cache, _off+6);
int hOff = ByteUtil.getInt2(_cache, _off+10);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(b, result);
}
_off += 12;
_inGroupEntry = false;
} else if (groupID == 0x80) {
// start of a group entry
_entryLength = _cache[_off+1] & 0xff;
byte[] b = new byte[_entryLength];
if (_type == KEYWORD || _type == CROSS) {
// the key starts at offset 6 and is followed by a 6-byte heading location
if (_off + _entryLength + 12 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
System.arraycopy(_cache, _off+6, b, 0, b.length);
_off += _entryLength + 6;
_comparison = _compareSingle(_word, b);
long hPage = ByteUtil.getLong4(_cache, _off);
int hOff = ByteUtil.getInt2(_cache, _off+4);
_keywordHeading =
BookInputStream.getPosition(hPage, hOff);
_off += 6;
} else if (_type == MULTI) {
// the key starts at offset 6; nothing follows it
if (_off + _entryLength + 6 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
System.arraycopy(_cache, _off+6, b, 0, b.length);
_comparison = _compareSingle(_word, b);
_off += _entryLength + 6;
} else {
// all other search types: the key starts at offset 4
if (_off + _entryLength + 4 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
System.arraycopy(_cache, _off+4, b, 0, b.length);
_comparison = _compareSingle(_canonical, b);
_off += _entryLength + 4;
}
// remember the group key so that the elements of this group can be exported under it
_currentGroupEntryIndex = b.clone();
_inGroupEntry = true;
} else if (groupID == 0xc0) {
// element of a group entry
if (_type == KEYWORD || _type == CROSS) {
// 7-byte element: ID byte plus a 6-byte text location
if (_off + 7 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
if (_comparison == 1 && _inGroupEntry) {
// read the location of the body text; the heading comes from the running heading position
long tPage = ByteUtil.getLong4(_cache, _off+1);
int tOff = ByteUtil.getInt2(_cache, _off+5);
Result result = new Result(_sub, _keywordHeading, tPage, tOff);
// advance the heading position for the next element of the group
_keywordHeading =
_sub.getNextHeadingPosition(_keywordHeading);
export(_currentGroupEntryIndex, result);
}
_off += 7;
} else if (_type == MULTI) {
// 13-byte element: ID byte plus a 12-byte location record
if (_off + 13 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
if (_comparison == 1 && _inGroupEntry) {
// read the locations of the body text and the heading
long tPage = ByteUtil.getLong4(_cache, _off+1);
int tOff = ByteUtil.getInt2(_cache, _off+5);
long hPage = ByteUtil.getLong4(_cache, _off+7);
int hOff = ByteUtil.getInt2(_cache, _off+11);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(_currentGroupEntryIndex, result);
}
_off += 13;
} else {
// all other search types: ID byte, length byte, key, then a 12-byte location record;
// here the element's own key is exported as the index value
_entryLength = _cache[_off+1] & 0xff;
if (_off + _entryLength + 14 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
byte[] b = new byte[_entryLength];
System.arraycopy(_cache, _off+2, b, 0, b.length);
_off += _entryLength + 2;
if (_comparison == 1 && _inGroupEntry
&& _compareGroup(_word, b) == 0) {
// read the locations of the body text and the heading
long tPage = ByteUtil.getLong4(_cache, _off);
int tOff = ByteUtil.getInt2(_cache, _off+4);
long hPage = ByteUtil.getLong4(_cache, _off+6);
int hOff = ByteUtil.getInt2(_cache, _off+10);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(b, result);
}
_off += 12;
}
} else {
// unknown group ID
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
_entryIndex++;
}
}
// end of EntryEnumerator.java (adapted from SingleWordSearcher.java)
package com.github.rubyu.ebquery;
/**
 * Receives the dictionary entries produced by an enumeration, one call per entry.
 */
public interface IExporter {

    /**
     * Handles a single entry.
     *
     * @param indexValue index key under which the entry was found.
     * @param heading heading of the entry.
     * @param description body text of the entry.
     */
    void export(String indexValue, String heading, String description);
}
package com.github.rubyu.ebquery
import java.io.{File, PrintWriter}
import io.github.eb4j.{Book, EntryEnumerator, SingleWordEnumerator}
import io.github.eb4j.io.{BookReader, EPWINGInputStream}
import java.lang.reflect.Method
import org.specs2.mutable.Specification
/**
 * Manual smoke test: dumps every entry of a local EPWING dictionary to
 * "SRD.dump.tsv" through EntryEnumerator and TSVExporter.
 *
 * The dictionary paths below are machine-specific; the test only checks that
 * the dump runs to completion.
 */
class JSONExporterTest extends Specification {
  "EBLeaf" should {
    "equal with a instance be of the same string" in {
      val dir = "/Users/i.y.nishiseki/Dropbox/work/dictionary/SRD"
      val mapFilePath = "/Users/i.y.nishiseki/Dropbox/work/dictionary/SRD/SRD.map"
      val subBook = new Book(dir).getSubBook(0)
      val mapper = new ExternalCharacterMapper(mapFilePath)
      val proc = new EBProcessor
      proc.newline = new EBProcessorImpl.text.Newline
      proc.externalCharacter = new EBProcessorImpl.text.ReplacementCharacter
      proc.text = new EBProcessorImpl.text.Text
      val hook = new EBProcessorAdapter(subBook, mapper, proc)
      val exporter = new TSVExporter(new PrintWriter((new File("SRD.dump.tsv"))))
      // Always close the exporter: the underlying PrintWriter buffers its output, so
      // without the close the tail of the dump (or all of it) may never reach the file.
      try {
        val enumerator = EntryEnumerator.Create(subBook, hook, exporter)
        // println (Iterator.continually(searcher.getNextResult()) takeWhile(_ != null) size)
        // 1118269
        // Iterator.continually(enumerator.getNextResult()) takeWhile(_ != null) foreach { result =>
        // val h = result.getHeading(hook)
        // val t = result.getText(hook)
        // if (h.startsWith("get")) {
        // println(s"{heading=$h, text=$t".replace("\n", "¥n"))
        // }
        // }
        // A single call enumerates the whole index; entries arrive at the exporter as a side effect.
        enumerator.getNextResult()
      } finally {
        exporter.close()
      }
      // {heading=ab·so·lute, text=ABSOLUTE ADDRESS.¥n━n.⇑¥n【1】絶対的なもの(↔relative).¥n
      // {heading=act, text=ACT OF TOLERATION [UNIFORMITY].¥n(2)⦅しばしば A-⦆(会議の)記録,
      // {heading=ac·tor, text=a BAD ACTOR.¥n【4】〘ローマ法〙原告(plaintiff), 弁護人(advocat
      // {heading=age, text=STONE AGE¥nBRONZE AGE¥nIRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe
      // {heading=age, text=BRONZE AGE¥nIRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe age of elec
      // {heading=age, text=IRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe age of electronic commu
      true mustEqual true
    }
  }
}
package com.github.rubyu.ebquery
import java.io.Writer
import com.github.tototoshi.csv._
/**
 * IExporter that writes every entry as one tab-separated row
 * (index value, heading, description) to the given writer.
 *
 * Control characters that would break the row/column structure are written as
 * backslash escapes: backslash as "\\", newline as "\n", tab as "\t" and
 * carriage return as "\r".
 *
 * @param w destination of the rows; closed by close().
 */
class TSVExporter(w: Writer) extends IExporter {
  implicit val format: TSVFormat = new TSVFormat {}
  val writer = CSVWriter.open(w)(format)

  /**
   * Escapes a field so that it occupies exactly one cell on one line.
   * The backslash is escaped first so that the escapes added afterwards stay unambiguous.
   * Literal replace is used instead of regex replaceAll; no pattern matching is needed.
   */
  private def escape(s: String): String =
    s.replace("\\", "\\\\")
      .replace("\n", "\\n")
      .replace("\t", "\\t") // the TSV format does not quote fields, so a raw tab would split the cell
      .replace("\r", "\\r") // a raw CR could be read as (part of) a line terminator

  override def export(indexValue: String, heading: String, description: String): Unit =
    writer.writeRow(List(escape(indexValue), escape(heading), escape(description)))

  /** Closes the underlying CSV writer. */
  def close(): Unit = writer.close()
}
@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

https://stackoverflow.com/questions/990904/remove-accents-diacritics-in-a-string-in-javascript

        {'base':'A', 'letters':'\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F'},
        {'base':'AA','letters':'\uA732'},
        {'base':'AE','letters':'\u00C6\u01FC\u01E2'},
        {'base':'AO','letters':'\uA734'},
        {'base':'AU','letters':'\uA736'},
        {'base':'AV','letters':'\uA738\uA73A'},
        {'base':'AY','letters':'\uA73C'},
        {'base':'B', 'letters':'\u0042\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0182\u0181'},
        {'base':'C', 'letters':'\u0043\u24B8\uFF23\u0106\u0108\u010A\u010C\u00C7\u1E08\u0187\u023B\uA73E'},
        {'base':'D', 'letters':'\u0044\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018B\u018A\u0189\uA779\u00D0'},
        {'base':'DZ','letters':'\u01F1\u01C4'},
        {'base':'Dz','letters':'\u01F2\u01C5'},
        {'base':'E', 'letters':'\u0045\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E'},
        {'base':'F', 'letters':'\u0046\u24BB\uFF26\u1E1E\u0191\uA77B'},
        {'base':'G', 'letters':'\u0047\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E'},
        {'base':'H', 'letters':'\u0048\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D'},
        {'base':'I', 'letters':'\u0049\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197'},
        {'base':'J', 'letters':'\u004A\u24BF\uFF2A\u0134\u0248'},
        {'base':'K', 'letters':'\u004B\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2'},
        {'base':'L', 'letters':'\u004C\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780'},
        {'base':'LJ','letters':'\u01C7'},
        {'base':'Lj','letters':'\u01C8'},
        {'base':'M', 'letters':'\u004D\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C'},
        {'base':'N', 'letters':'\u004E\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u0220\u019D\uA790\uA7A4'},
        {'base':'NJ','letters':'\u01CA'},
        {'base':'Nj','letters':'\u01CB'},
        {'base':'O', 'letters':'\u004F\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C'},
        {'base':'OI','letters':'\u01A2'},
        {'base':'OO','letters':'\uA74E'},
        {'base':'OU','letters':'\u0222'},
        {'base':'OE','letters':'\u008C\u0152'},
        {'base':'oe','letters':'\u009C\u0153'},
        {'base':'P', 'letters':'\u0050\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754'},
        {'base':'Q', 'letters':'\u0051\u24C6\uFF31\uA756\uA758\u024A'},
        {'base':'R', 'letters':'\u0052\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782'},
        {'base':'S', 'letters':'\u0053\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784'},
        {'base':'T', 'letters':'\u0054\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786'},
        {'base':'TZ','letters':'\uA728'},
        {'base':'U', 'letters':'\u0055\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244'},
        {'base':'V', 'letters':'\u0056\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245'},
        {'base':'VY','letters':'\uA760'},
        {'base':'W', 'letters':'\u0057\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72'},
        {'base':'X', 'letters':'\u0058\u24CD\uFF38\u1E8A\u1E8C'},
        {'base':'Y', 'letters':'\u0059\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE'},
        {'base':'Z', 'letters':'\u005A\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762'},
        {'base':'a', 'letters':'\u0061\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250'},
        {'base':'aa','letters':'\uA733'},
        {'base':'ae','letters':'\u00E6\u01FD\u01E3'},
        {'base':'ao','letters':'\uA735'},
        {'base':'au','letters':'\uA737'},
        {'base':'av','letters':'\uA739\uA73B'},
        {'base':'ay','letters':'\uA73D'},
        {'base':'b', 'letters':'\u0062\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253'},
        {'base':'c', 'letters':'\u0063\u24D2\uFF43\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184'},
        {'base':'d', 'letters':'\u0064\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\uA77A'},
        {'base':'dz','letters':'\u01F3\u01C6'},
        {'base':'e', 'letters':'\u0065\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u025B\u01DD'},
        {'base':'f', 'letters':'\u0066\u24D5\uFF46\u1E1F\u0192\uA77C'},
        {'base':'g', 'letters':'\u0067\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\u1D79\uA77F'},
        {'base':'h', 'letters':'\u0068\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265'},
        {'base':'hv','letters':'\u0195'},
        {'base':'i', 'letters':'\u0069\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131'},
        {'base':'j', 'letters':'\u006A\u24D9\uFF4A\u0135\u01F0\u0249'},
        {'base':'k', 'letters':'\u006B\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3'},
        {'base':'l', 'letters':'\u006C\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747'},
        {'base':'lj','letters':'\u01C9'},
        {'base':'m', 'letters':'\u006D\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F'},
        {'base':'n', 'letters':'\u006E\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5'},
        {'base':'nj','letters':'\u01CC'},
        {'base':'o', 'letters':'\u006F\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\u0254\uA74B\uA74D\u0275'},
        {'base':'oi','letters':'\u01A3'},
        {'base':'ou','letters':'\u0223'},
        {'base':'oo','letters':'\uA74F'},
        {'base':'p','letters':'\u0070\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755'},
        {'base':'q','letters':'\u0071\u24E0\uFF51\u024B\uA757\uA759'},
        {'base':'r','letters':'\u0072\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783'},
        {'base':'s','letters':'\u0073\u24E2\uFF53\u00DF\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B'},
        {'base':'t','letters':'\u0074\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787'},
        {'base':'tz','letters':'\uA729'},
        {'base':'u','letters': '\u0075\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289'},
        {'base':'v','letters':'\u0076\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C'},
        {'base':'vy','letters':'\uA761'},
        {'base':'w','letters':'\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73'},
        {'base':'x','letters':'\u0078\u24E7\uFF58\u1E8B\u1E8D'},
        {'base':'y','letters':'\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF'},
        {'base':'z','letters':'\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763'}
\s+{'base':'([^']+)',\s*'letters':\s*'([^']+?)'}.*
  c("$1", "$2") + \
  c("A", "\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F") + \
  c("AA", "\uA732") + \
  c("AE", "\u00C6\u01FC\u01E2") + \
  c("AO", "\uA734") + \
  c("AU", "\uA736") + \
  c("AV", "\uA738\uA73A") + \
  c("AY", "\uA73C") + \
  c("B", "\u0042\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0182\u0181") + \
  c("C", "\u0043\u24B8\uFF23\u0106\u0108\u010A\u010C\u00C7\u1E08\u0187\u023B\uA73E") + \
  c("D", "\u0044\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018B\u018A\u0189\uA779\u00D0") + \
  c("DZ", "\u01F1\u01C4") + \
  c("Dz", "\u01F2\u01C5") + \
  c("E", "\u0045\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E") + \
  c("F", "\u0046\u24BB\uFF26\u1E1E\u0191\uA77B") + \
  c("G", "\u0047\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E") + \
  c("H", "\u0048\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D") + \
  c("I", "\u0049\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197") + \
  c("J", "\u004A\u24BF\uFF2A\u0134\u0248") + \
  c("K", "\u004B\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2") + \
  c("L", "\u004C\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780") + \
  c("LJ", "\u01C7") + \
  c("Lj", "\u01C8") + \
  c("M", "\u004D\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C") + \
  c("N", "\u004E\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u0220\u019D\uA790\uA7A4") + \
  c("NJ", "\u01CA") + \
  c("Nj", "\u01CB") + \
  c("O", "\u004F\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C") + \
  c("OI", "\u01A2") + \
  c("OO", "\uA74E") + \
  c("OU", "\u0222") + \
  c("OE", "\u008C\u0152") + \
  c("oe", "\u009C\u0153") + \
  c("P", "\u0050\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754") + \
  c("Q", "\u0051\u24C6\uFF31\uA756\uA758\u024A") + \
  c("R", "\u0052\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782") + \
  c("S", "\u0053\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784") + \
  c("T", "\u0054\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786") + \
  c("TZ", "\uA728") + \
  c("U", "\u0055\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244") + \
  c("V", "\u0056\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245") + \
  c("VY", "\uA760") + \
  c("W", "\u0057\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72") + \
  c("X", "\u0058\u24CD\uFF38\u1E8A\u1E8C") + \
  c("Y", "\u0059\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE") + \
  c("Z", "\u005A\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762") + \
  c("a", "\u0061\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250") + \
  c("aa", "\uA733") + \
  c("ae", "\u00E6\u01FD\u01E3") + \
  c("ao", "\uA735") + \
  c("au", "\uA737") + \
  c("av", "\uA739\uA73B") + \
  c("ay", "\uA73D") + \
  c("b", "\u0062\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253") + \
  c("c", "\u0063\u24D2\uFF43\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184") + \
  c("d", "\u0064\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\uA77A") + \
  c("dz", "\u01F3\u01C6") + \
  c("e", "\u0065\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u025B\u01DD") + \
  c("f", "\u0066\u24D5\uFF46\u1E1F\u0192\uA77C") + \
  c("g", "\u0067\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\u1D79\uA77F") + \
  c("h", "\u0068\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265") + \
  c("hv", "\u0195") + \
  c("i", "\u0069\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131") + \
  c("j", "\u006A\u24D9\uFF4A\u0135\u01F0\u0249") + \
  c("k", "\u006B\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3") + \
  c("l", "\u006C\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747") + \
  c("lj", "\u01C9") + \
  c("m", "\u006D\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F") + \
  c("n", "\u006E\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5") + \
  c("nj", "\u01CC") + \
  c("o", "\u006F\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\u0254\uA74B\uA74D\u0275") + \
  c("oi", "\u01A3") + \
  c("ou", "\u0223") + \
  c("oo", "\uA74F") + \
  c("p", "\u0070\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755") + \
  c("q", "\u0071\u24E0\uFF51\u024B\uA757\uA759") + \
  c("r", "\u0072\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783") + \
  c("s", "\u0073\u24E2\uFF53\u00DF\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B") + \
  c("t", "\u0074\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787") + \
  c("tz", "\uA729") + \
  c("u", "\u0075\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289") + \
  c("v", "\u0076\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C") + \
  c("vy", "\uA761") + \
  c("w", "\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73") + \
  c("x", "\u0078\u24E7\uFF58\u1E8B\u1E8D") + \
  c("y", "\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF") + \
  c("z", "\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763") + \

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

csvq -o SRD.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') AS agg_c3 FROM (SELECT LOWER(c2) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.dump.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

csvq -o SRD.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') AS agg_c3 FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.dump.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

動かない…

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

csvq -o SRD.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.dump.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

ダメ…

@rubyu
Copy link
Author

rubyu commented Jul 4, 2019

1	h1	d1
2	h1	d2
3	h2	d3
4	h3	d3

csvq -no-header "SELECT c2, LISTAGG(c3, '-') FROM test.tsv GROUP BY c2"
OK

csvq -no-header "SELECT lower_c2, LISTAGG(c3, '-') FROM (SELECT LOWER(c2) AS lower_c2, c3 FROM test.tsv) GROUP BY lower_c2"
OK

csvq -no-header "SELECT lower_c2, LISTAGG(c3, '-') FROM (SELECT LOWER(c2) AS lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM test.tsv)) GROUP BY lower_c2"
OK

あれ…?

@rubyu
Copy link
Author

rubyu commented Jul 4, 2019

grep -E "\tA" SRD.dump.tsv > SRD.A.tsv

csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

再現した

rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '-') FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '-') FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2 ORDER BY lower_c2 ASC"

OK!!!!
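GROUP BYの誤用だった… GROUP BY に c3 まで含めていたため (lower_c2, c3) の組ごとにグループ化されてしまい、異なる c3 が LISTAGG で連結されなかった。GROUP BY lower_c2 だけにすれば見出しごとに集約される。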

@rubyu
Copy link
Author

rubyu commented Jul 4, 2019

dump dump1 get get1 get2
のような、末尾に数字が付いている見出しをうまく拾えない

@rubyu
Copy link
Author

rubyu commented Jul 4, 2019

1	h1	d1
2	h1	d2
3	h2	d3
4	h3	d3

csvq -no-header "
SELECT
c2,
SUBSTR(c2, 0,
CASE SUBSTR(c2, LEN(c2) - 1, 1)
WHEN '1' THEN LEN(c2) - 1
WHEN '2' THEN LEN(c2) - 1
WHEN '3' THEN LEN(c2) - 1
WHEN '4' THEN LEN(c2) - 1
WHEN '5' THEN LEN(c2) - 1
WHEN '6' THEN LEN(c2) - 1
WHEN '7' THEN LEN(c2) - 1
WHEN '8' THEN LEN(c2) - 1
WHEN '9' THEN LEN(c2) - 1
ELSE LEN(c2)
END) AS tail_trimmed
FROM test.tsv
GROUP BY c2"

+----+--------------+
| c2 | tail_trimmed |
+----+--------------+
| h1 | h |
| h2 | h |
| h3 | h |
+----+--------------+

csvq のSUBSTRは0オリジン…?(SUBSTR(c2, LEN(c2) - 1, 1) で末尾の1文字が取れているので、位置は0始まりのようだ)

1	ha1	d1
2	haa1	d2
3	haaa2	d3
4	haaaa3	d3

csvq -no-header "
SELECT
c2,
SUBSTR(c2, 0,
CASE SUBSTR(c2, LEN(c2) - 1, 1)
WHEN '1' THEN LEN(c2) - 1
WHEN '2' THEN LEN(c2) - 1
WHEN '3' THEN LEN(c2) - 1
WHEN '4' THEN LEN(c2) - 1
WHEN '5' THEN LEN(c2) - 1
WHEN '6' THEN LEN(c2) - 1
WHEN '7' THEN LEN(c2) - 1
WHEN '8' THEN LEN(c2) - 1
WHEN '9' THEN LEN(c2) - 1
ELSE LEN(c2)
END) AS tail_trimmed
FROM test.tsv
GROUP BY c2"
+--------+--------------+
| c2 | tail_trimmed |
+--------+--------------+
| ha1 | ha |
| haa1 | haa |
| haaa2 | haaa |
| haaaa3 | haaaa |
+--------+--------------+

@rubyu
Copy link
Author

rubyu commented Jul 4, 2019

csvq "SELECT INSTR('foo@example.com', '@');"

+-------------------------------+
| INSTR('foo@example.com', '@') |
+-------------------------------+
|                             3 |
+-------------------------------+

@rubyu
Copy link
Author

rubyu commented Jul 5, 2019

BQでやる

Google Driveにgzを置いて、外部テーブルとしてBQで設定

INSERT INTO `test.srd`
SELECT * FROM `test.test`

@rubyu
Copy link
Author

rubyu commented Jul 5, 2019

インデックスが0x00で始まるものだけをフィルタすると、0xFFFDを含むものをフィルタした場合よりも件数が増えた。これは想定に反している。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment