Skip to content

Instantly share code, notes, and snippets.

@rubyu
Last active July 5, 2019 09:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rubyu/cb04e8457993a5e53cc6c1bc1d8d4796 to your computer and use it in GitHub Desktop.
Save rubyu/cb04e8457993a5e53cc6c1bc1d8d4796 to your computer and use it in GitHub Desktop.
[WIP] eb2json
import sbt.Keys._
import sbt._
import sbtassembly.AssemblyPlugin.autoImport._
/** sbt build definition for the single-module `ebquery` project (assembled into a fat jar via sbt-assembly). */
object Build extends sbt.Build {

  /** Project coordinates and Scala version, layered on top of sbt's core defaults. */
  lazy val commonSettings =
    Defaults.coreDefaultSettings ++ Seq(
      version := "0.3.2",
      scalaVersion := "2.11.8",
      organization := "com.github.rubyu",
      name := "ebquery"
    )

  /** Libraries needed on the compile/runtime classpath. */
  private lazy val runtimeDependencies = Seq(
    "com.github.tototoshi" %% "scala-csv" % "1.3.6",
    "org.scala-lang.modules" % "scala-xml_2.11" % "1.0.4",
    "org.slf4j" % "slf4j-api" % "1.7.21",
    "org.slf4j" % "slf4j-simple" % "1.7.21",
    "args4j" % "args4j" % "2.0.26",
    "commons-codec" % "commons-codec" % "1.9",
    "commons-lang" % "commons-lang" % "2.4"
  )

  /** Libraries needed only in the `test` configuration. */
  private lazy val testDependencies = Seq(
    "org.specs2" %% "specs2-core" % "3.7.2" % "test",
    "junit" % "junit" % "4.7" % "test",
    "com.rexsl" % "rexsl-w3c" % "0.13" % "test",
    "com.rexsl" % "rexsl-test" % "0.4.12" % "test",
    "javax.json" % "javax.json-api" % "1.0" % "test",
    // halt warning messages for multiple dependencies
    "org.scala-lang" % "scala-reflect" % "2.11.8" % "test",
    "org.scala-lang" % "scala-compiler" % "2.11.8" % "test",
    // halt warning messages for circular dependencies
    "com.jcabi" % "jcabi-log" % "0.12.1" % "test"
  )

  /** The root project: common settings, assembly configuration, compiler flags, then dependencies. */
  lazy val project =
    Project("ebquery", file("."))
      .settings(commonSettings: _*)
      .settings(
        mainClass in assembly := Some("com.github.rubyu.ebquery.Main"),
        assemblyJarName in assembly := name.value + "-" + version.value + ".jar",
        scalacOptions := Seq("-deprecation", "-unchecked", "-feature"),
        libraryDependencies ++= runtimeDependencies ++ testDependencies
      )
}
package io.github.eb4j;
import java.io.UnsupportedEncodingException;
import io.github.eb4j.hook.Hook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import io.github.eb4j.io.EBFile;
import io.github.eb4j.io.BookInputStream;
import io.github.eb4j.util.ByteUtil;
import io.github.eb4j.util.HexUtil;
import com.github.rubyu.ebquery.IExporter;
/**
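* Enumerates the entries of a subbook's word index and hands each one to an
* {@link com.github.rubyu.ebquery.IExporter}. Adapted from eb4j's single-word search class.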
*
* @author Hisaya FUKUMOTO
*
* Copied from: https://github.com/eb4j/eb4j/blob/5c1dd0a8aa6eca5ae7489787456333d7eef5fa2a/eb4j-core/src/main/java/io/github/eb4j/SingleWordSearcher.java
*/
public class EntryEnumerator implements Searcher {
/** Hook passed to {@code Result.getHeading}/{@code Result.getText} when an entry is exported. */
private final Hook<String> _hook;
/** Receives every exported entry (index key, heading, text). */
private final IExporter _exporter;
/**
 * Creates an enumerator over one of the subbook's word indexes.
 *
 * The ALPHABET index (2) is preferred, then KANJI (1). The KANA index (0) is used only as a
 * last resort, in the case where the other two are absent and construction previously failed
 * with a {@code NullPointerException}.
 *
 * @param sub subbook whose word index is enumerated.
 * @param hook hook used to render headings and texts.
 * @param exporter sink that receives every enumerated entry.
 * @return a new enumerator, already positioned by its constructor.
 * @throws IllegalArgumentException if the subbook exposes none of the three word indexes.
 */
public static EntryEnumerator Create(SubBook sub, Hook<String> hook, IExporter exporter) {
    // Index kinds in order of preference: ALPHABET, KANJI, KANA.
    final int[] preference = {2, 1, 0};
    IndexStyle wordStyle = null;
    for (int i = 0; i < preference.length && wordStyle == null; i++) {
        wordStyle = sub.getWordIndexStyle(preference[i]);
    }
    if (wordStyle == null) {
        throw new IllegalArgumentException(
            "subbook has no word index (ALPHABET, KANJI or KANA) to enumerate");
    }
    return new EntryEnumerator(sub, hook, exporter, wordStyle, EXACTWORD);
}
/** Constant indicating a forward-match (prefix) search. */
protected static final int WORD = 0;
/** Constant indicating a backward-match (suffix) search. */
protected static final int ENDWORD = 1;
/** Constant indicating an exact-match search. */
protected static final int EXACTWORD = 2;
/** Constant indicating a conditional (keyword) search. */
protected static final int KEYWORD = 3;
/** Constant indicating a cross search. */
protected static final int CROSS = 4;
/** Constant indicating a compound (multi) search. */
protected static final int MULTI = 5;
/** Maximum index depth. */
private static final int MAX_INDEX_DEPTH = 6;
/** Entry arrangement style: variable-length entries. */
private static final int VARIABLE = 0;
/** Entry arrangement style: fixed-length entries. */
private static final int FIXED = 1;
/** Logger. */
private Logger _logger = null;
/** Subbook. */
private SubBook _sub = null;
/** Index style. */
private IndexStyle _style = null;
/** Current search type. */
private int _type = 0;
/** Search word. */
private byte[] _word = null;
/** Search key (canonical form of the search word). */
private byte[] _canonical = null;
/** File to search. */
private EBFile _file = null;
/** Page cache. */
private byte[] _cache = new byte[BookInputStream.PAGE_SIZE];
/** Page position of the cached page. */
private long _cachePage = 0L;
/** Current offset within the cache. */
private int _off = 0;
/** Page position of the data. */
private long _page = 0L;
/** Page ID of the data page. */
private int _pageID = 0;
/** Entry size. */
private int _entryLength = 0;
/** Entry arrangement ({@link #VARIABLE} or {@link #FIXED}). */
private int _entryArrangement = 0;
/** Number of entries. */
private int _entryCount = 0;
/** Index of the current entry. */
private int _entryIndex = 0;
/** Flag indicating that parsing is currently inside a group entry. */
private boolean _inGroupEntry = false;
/** Comparison result. */
private int _comparison = -1;
/** Heading position used for keyword search. */
private long _keywordHeading = 0L;
/** Key bytes of the most recently parsed group-start entry; exported as the index value of KEYWORD/CROSS/MULTI group elements. */
private byte[] _currentGroupEntryIndex = null;
/**
 * Builds the enumerator and immediately positions it on the index by running the
 * internal dummy search.
 *
 * @param sub subbook.
 * @param hook hook used when headings and texts are read for export.
 * @param exporter sink that receives every exported entry.
 * @param style index style.
 * @param type search type.
 * @see #WORD
 * @see #ENDWORD
 * @see #EXACTWORD
 * @see #KEYWORD
 * @see #CROSS
 * @see #MULTI
 */
protected EntryEnumerator(final SubBook sub, final Hook<String> hook, final IExporter exporter, final IndexStyle style, final int type) {
    // The logger must exist before search() runs, because the search path logs.
    this._logger = LoggerFactory.getLogger(getClass());
    this._hook = hook;
    this._exporter = exporter;
    this._sub = sub;
    this._file = sub.getTextFile();
    this._style = style;
    this._type = type;
    search();
}
// Stub comparator for intermediate index pages: always reports 0 ("equal"), ignoring both arguments.
// search(byte[]) follows the first entry whose result is <= 0, so the first entry of each
// intermediate page is always taken.
private int _comparePre(final byte[] key, final byte[] pattern) {
return 0;
}
// Stub comparator for leaf entries: always reports 1, ignoring both arguments.
// The parse methods export an entry when _comparison == 1, so every entry passes that check.
private int _compareSingle(byte[] key, byte[] pattern) {
return 1;
}
// Stub comparator for group elements: always reports 0, ignoring both arguments.
// parseGroupedEntry() requires this result to be 0 before exporting an element.
private int _compareGroup(byte[] key, byte[] pattern) {
return 0;
}
/**
 * Positions the enumerator on the index by running {@link #search(byte[])} with a
 * placeholder word.
 *
 * A failure is logged rather than rethrown, because this method is called from the
 * constructor, which declares no checked exception. After a failure {@code _comparison}
 * remains negative, so {@link #getNextResult()} returns without exporting anything.
 */
private void search() {
    try {
        search("dummy search string".getBytes());
    } catch (EBException ex) {
        // Previously swallowed silently, which made an empty export impossible to diagnose.
        _logger.error("Failed to position on the index; no entries will be enumerated", ex);
    }
}
/**
 * Decodes an index key and forwards it, together with the entry's heading and text,
 * to the exporter.
 *
 * Entries whose key cannot be decoded, or decodes to text containing U+FFFD, are skipped.
 * Failures are logged and the entry is dropped; nothing is rethrown, so enumeration continues.
 *
 * @param indexBytes raw index key bytes, decoded as {@code x-JIS0208}.
 * @param result entry whose heading and text are read through the hook.
 */
private void export(byte[] indexBytes, Result result) {
    final String indexValue;
    try {
        indexValue = new String(indexBytes, "x-JIS0208");
    } catch (UnsupportedEncodingException ex) {
        _logger.warn("Charset x-JIS0208 is not available; entry skipped", ex);
        return;
    }
    // A replacement character means the key bytes were not valid JIS X 0208 text.
    if (indexValue.contains("\uFFFD")) {
        return;
    }
    try {
        String heading = result.getHeading(this._hook).toString();
        String description = result.getText(this._hook).toString();
        this._exporter.export(indexValue, heading, description);
    } catch (EBException ex) {
        _logger.warn("Failed to read heading/text for index '{}'; entry skipped", indexValue, ex);
    }
}
/**
 * Stores the search word and derives its canonical form.
 *
 * Both {@code _word} and {@code _canonical} start as copies of the input. The canonical
 * copy is normalized by the index style; unless the index ID is 0x70 or 0x90, the
 * normalized bytes also replace {@code _word}. For ENDWORD searches both arrays are reversed.
 *
 * @param word a search word.
 */
private void _setWord(final byte[] word) {
    final int length = word.length;
    _word = word.clone();
    _canonical = word.clone();

    if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
        _style.fixWordLatin(_canonical);
    } else {
        _style.fixWord(_canonical);
    }

    if (!(_style.getIndexID() == 0x70 || _style.getIndexID() == 0x90)) {
        System.arraycopy(_canonical, 0, _word, 0, length);
    }

    // Suffix search: reverse the word
    if (_type == ENDWORD) {
        if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
            ByteUtil.reverseWordLatin(_word);
            ByteUtil.reverseWordLatin(_canonical);
        } else {
            ByteUtil.reverseWord(_word);
            ByteUtil.reverseWord(_canonical);
        }
    }

    try {
        final String wordText = new String(_word, "x-JIS0208");
        _logger.debug("search word: '" + wordText + "'");
        final String canonicalText = new String(_canonical, "x-JIS0208");
        _logger.debug("search canonical word: '" + canonicalText + "'");
    } catch (UnsupportedEncodingException e) {
    }
}
// /**
// * キーとパターンを比較します。
// *
// * @param key キー
// * @param pattern パターン
// * @return キーがパターンと同じ場合:0、
// * キーがパターンより大きい場合:1以上、
// * キーがパターンより小さい場合:-1以下
// */
// private int _comparePre(final byte[] key, final byte[] pattern) {
// int comp = 0;
// switch (_type) {
// case EXACTWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, true);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, true);
// }
// break;
// case MULTI:
// if (_style.getCandidatePage() == 0) {
// comp = CompareUtil.compareToByte(key, pattern, true);
// } else {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, true);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, true);
// }
// }
// break;
// case WORD:
// case ENDWORD:
// case KEYWORD:
// case CROSS:
// default:
// comp = CompareUtil.compareToByte(key, pattern, true);
// break;
// }
// try {
// _logger.debug("compare key word: (" + comp + ") '"
// + new String(key, "x-JIS0208") + "' '"
// + new String(pattern, "x-JIS0208") + "'");
// } catch (UnsupportedEncodingException e) {
// }
// return comp;
// }
//
// /**
// * キーとパターンを比較します。
// *
// * @param key キー
// * @param pattern パターン
// * @return キーがパターンと同じ場合:0、
// * キーがパターンより大きい場合:1以上、
// * キーがパターンより小さい場合:-1以下
// */
// private int _compareSingle(final byte[] key, final byte[] pattern) {
// int comp = 0;
// switch (_type) {
// case ENDWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// IndexStyle style = _sub.getEndwordIndexStyle(SubBook.KANA);
// if (style != null && _style.getStartPage() == style.getStartPage()) {
// comp = CompareUtil.compareToKanaSingle(key, pattern, false);
// } else {
// comp = CompareUtil.compareToByte(key, pattern, false);
// }
// }
// break;
// case EXACTWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// IndexStyle style = _sub.getWordIndexStyle(SubBook.KANA);
// if (style != null && _style.getStartPage() == style.getStartPage()) {
// comp = CompareUtil.compareToKanaSingle(key, pattern, true);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, false);
// }
// }
// break;
// case KEYWORD:
// case CROSS:
// comp = CompareUtil.compareToByte(key, pattern, false);
// break;
// case MULTI:
// if (_style.getCandidatePage() == 0) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// comp = CompareUtil.compareToJISX0208(key, pattern, false);
// }
// }
// break;
// case WORD:
// default:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// IndexStyle style = _sub.getWordIndexStyle(SubBook.KANA);
// if (style != null && _style.getStartPage() == style.getStartPage()) {
// comp = CompareUtil.compareToKanaSingle(key, pattern, false);
// } else {
// comp = CompareUtil.compareToByte(key, pattern, false);
// }
// }
// break;
// }
// try {
// _logger.debug("compare key word: (" + comp + ") '"
// + new String(key, "x-JIS0208") + "' '"
// + new String(pattern, "x-JIS0208") + "'");
// } catch (UnsupportedEncodingException e) {
// }
// return comp;
// }
//
// /**
// * キーとパターンを比較します。
// *
// * @param key キー
// * @param pattern パターン
// * @return キーがパターンと同じ場合:0、
// * キーがパターンより大きい場合:1以上、
// * キーがパターンより小さい場合:-1以下
// */
// private int _compareGroup(final byte[] key, final byte[] pattern) {
// int comp = 0;
// switch (_type) {
// case EXACTWORD:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, true);
// }
// break;
// case MULTI:
// if (_style.getCandidatePage() == 0) {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, false);
// }
// } else {
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToLatin(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, true);
// }
// }
// break;
// case WORD:
// case ENDWORD:
// case KEYWORD:
// case CROSS:
// default:
// if (_sub.getBook().getCharCode() == Book.CHARCODE_ISO8859_1) {
// comp = CompareUtil.compareToByte(key, pattern, false);
// } else {
// comp = CompareUtil.compareToKanaGroup(key, pattern, false);
// }
// break;
// }
// try {
// _logger.debug("compare key word: (" + comp + ") '"
// + new String(key, "x-JIS0208") + "' '"
// + new String(pattern, "x-JIS0208") + "'");
// } catch (UnsupportedEncodingException e) {
// }
// return comp;
// }
/**
* Performs the pre-search: starting at the index style's start page, loads index pages
* into the cache and descends until a leaf page is reached (at most MAX_INDEX_DEPTH levels).
* On success the leaf page is left in the cache with _entryIndex = 0 and _comparison = 1;
* if no child page can be selected, _comparison is set to -1 and the method returns.
*
* @param word the search word
* @exception EBException if an error occurs during pre-processing
*/
protected void search(final byte[] word) throws EBException {
_setWord(word);
_page = _style.getStartPage();
// pre-search
BookInputStream bis = _file.getInputStream();
try {
long nextPage = _page;
int depth;
for (depth=0; depth<MAX_INDEX_DEPTH; depth++) {
// Load the page into the cache
bis.seek(_page, 0);
bis.readFully(_cache, 0, _cache.length);
_cachePage = _page;
// Page header: byte 0 = page ID, byte 1 = entry length (0 means variable-length), bytes 2-3 = entry count
_pageID = _cache[0] & 0xff;
_entryLength = _cache[1] & 0xff;
if (_entryLength == 0) {
_entryArrangement = VARIABLE;
} else {
_entryArrangement = FIXED;
}
_entryCount = ByteUtil.getInt2(_cache, 2);
_off = 4;
_logger.debug("page=0x" + HexUtil.toHexString(_page)
+ ", ID=0x" + HexUtil.toHexString(_pageID));
// Stop descending once a leaf index page is reached
if (_isLeafLayer(_pageID)) {
break;
}
// Find the index page of the next level
byte[] b = new byte[_entryLength];
for (_entryIndex=0; _entryIndex<_entryCount; _entryIndex++) {
if (_off + _entryLength + 4 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
System.arraycopy(_cache, _off, b, 0, b.length);
_off += _entryLength;
// The 4 bytes after the key hold the child page number
if (_comparePre(_canonical, b) <= 0) {
nextPage = ByteUtil.getLong4(_cache, _off);
break;
}
_off += 4;
}
// No entry selected, or the selected child is the current page: nothing to enumerate
if (_entryIndex >= _entryCount || nextPage == _page) {
_comparison = -1;
return;
}
_page = nextPage;
}
// Check the index depth
if (depth == MAX_INDEX_DEPTH) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
} finally {
bis.close();
}
_entryIndex = 0;
_comparison = 1;
_inGroupEntry = false;
}
/**
 * Walks every remaining leaf page in a single call, parsing each entry (the parse
 * methods pass matching entries to the exporter), until a page flagged as the end of
 * its layer has been processed.
 *
 * @return always {@code null}; entries are delivered through the exporter rather than returned.
 * @exception EBException if an error occurs while reading or parsing the index.
 */
@Override
public Result getNextResult() throws EBException {
    if (_comparison < 0) {
        return null;
    }
    for (;;) {
        refreshCache();
        // Only leaf index pages may be enumerated
        if (!_isLeafLayer(_pageID)) {
            throw new EBException(EBException.UNEXP_FILE, _file.getPath());
        }
        final boolean grouped = _hasGroupEntry(_pageID);
        while (_entryIndex < _entryCount) {
            if (grouped) {
                parseGroupedEntry();
            } else {
                parseNonGroupEntry();
            }
        }
        // Finish on the last page of the layer, otherwise continue with the next page
        if (_isLayerEnd(_pageID)) {
            _comparison = -1;
            return null;
        }
        _page++;
        _entryIndex = 0;
    }
}
/**
 * Tells whether the given page belongs to the lowest (leaf) layer, i.e. bit 0x80 of the ID is set.
 *
 * @param id page ID
 * @return true for a leaf-layer page, false otherwise
 */
private boolean _isLeafLayer(final int id) {
    return (id & 0x80) != 0;
}
// /**
// * 指定されたページが階層開始ページかどうかを判別します。
// *
// * @param id ページID
// * @return 階層開始ページである場合はtrue、そうでない場合はfalse
// */
// private boolean _isLayerStart(int id) {
// if ((id & 0x40) == 0x40) {
// return true;
// }
// return false;
// }
/**
 * Tells whether the given page is the last page of its layer, i.e. bit 0x20 of the ID is set.
 *
 * @param id page ID
 * @return true for a layer-end page, false otherwise
 */
private boolean _isLayerEnd(final int id) {
    return (id & 0x20) != 0;
}
/**
 * Tells whether the given page contains group entries, i.e. bit 0x10 of the ID is set.
 *
 * @param id page ID
 * @return true if the page contains group entries, false otherwise
 */
private boolean _hasGroupEntry(final int id) {
    return (id & 0x10) != 0;
}
// Reloads the cache when the cached page differs from the current data page.
// The page header is re-parsed only when starting at the first entry of the page.
private void refreshCache() throws EBException {
    if (_cachePage == _page) {
        return;
    }
    final BookInputStream in = _file.getInputStream();
    try {
        in.seek(_page, 0);
        in.readFully(_cache, 0, _cache.length);
    } finally {
        in.close();
    }
    _cachePage = _page;
    if (_entryIndex != 0) {
        return;
    }
    _pageID = _cache[0] & 0xff;
    _entryLength = _cache[1] & 0xff;
    _entryArrangement = (_entryLength == 0) ? VARIABLE : FIXED;
    _entryCount = ByteUtil.getInt2(_cache, 2);
    _off = 4;
    _logger.info("page=0x" + HexUtil.toHexString(_page)
        + ", ID=0x" + HexUtil.toHexString(_pageID));
}
// Parses one entry of a leaf page that has no group entries and advances _off/_entryIndex past it.
// Layout read here: [1-byte key length, only when the arrangement is VARIABLE] key bytes,
// then 12 bytes: text page (4), text offset (2), heading page (4), heading offset (2).
// The entry is exported when _compareSingle reports 1.
private void parseNonGroupEntry() throws EBException {
if (_entryArrangement == VARIABLE) {
if (_off + 1 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
_entryLength = _cache[_off] & 0xff;
_off++;
}
// Key plus the 12-byte position record must fit inside the page
if (_off + _entryLength + 12 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
byte[] b = new byte[_entryLength];
System.arraycopy(_cache, _off, b, 0, b.length);
_off += _entryLength;
_comparison = _compareSingle(_word, b);
if (_comparison == 1) {
// Read the text and heading positions
long tPage = ByteUtil.getLong4(_cache, _off);
int tOff = ByteUtil.getInt2(_cache, _off+4);
long hPage = ByteUtil.getLong4(_cache, _off+6);
int hOff = ByteUtil.getInt2(_cache, _off+10);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(b, result);
}
_entryIndex++;
_off += 12;
}
// Parses one entry of a leaf page that contains group entries and advances _off/_entryIndex past it.
// The first byte of the entry is a group ID that selects the layout:
//   0x00 = single entry, 0x80 = start of a group entry, 0xc0 = element of a group entry.
// Any other ID is rejected with an EBException. For 0x80 and 0xc0 the layout further
// depends on the search type (_type).
private void parseGroupedEntry() throws EBException {
if (_off + 2 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
int groupID = _cache[_off] & 0xff;
if (groupID == 0x00) {
// Single entry: ID (1), key length (1), key, then a 12-byte position record
_entryLength = _cache[_off+1] & 0xff;
if (_off + _entryLength + 14 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
byte[] b = new byte[_entryLength];
System.arraycopy(_cache, _off+2, b, 0, b.length);
_off += _entryLength + 2;
_comparison = _compareSingle(_canonical, b);
if (_comparison == 1) {
// Read the text and heading positions
long tPage = ByteUtil.getLong4(_cache, _off);
int tOff = ByteUtil.getInt2(_cache, _off+4);
long hPage = ByteUtil.getLong4(_cache, _off+6);
int hOff = ByteUtil.getInt2(_cache, _off+10);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(b, result);
}
_off += 12;
_inGroupEntry = false;
} else if (groupID == 0x80) {
// Start of a group entry: the key read here is remembered for the elements that follow
_entryLength = _cache[_off+1] & 0xff;
byte[] b = new byte[_entryLength];
if (_type == KEYWORD || _type == CROSS) {
if (_off + _entryLength + 12 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
// Key starts 6 bytes into the entry and is followed by a 6-byte heading position
System.arraycopy(_cache, _off+6, b, 0, b.length);
_off += _entryLength + 6;
_comparison = _compareSingle(_word, b);
long hPage = ByteUtil.getLong4(_cache, _off);
int hOff = ByteUtil.getInt2(_cache, _off+4);
_keywordHeading =
BookInputStream.getPosition(hPage, hOff);
_off += 6;
} else if (_type == MULTI) {
if (_off + _entryLength + 6 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
// Key starts 6 bytes into the entry
System.arraycopy(_cache, _off+6, b, 0, b.length);
_comparison = _compareSingle(_word, b);
_off += _entryLength + 6;
} else {
if (_off + _entryLength + 4 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
// Key starts 4 bytes into the entry
System.arraycopy(_cache, _off+4, b, 0, b.length);
_comparison = _compareSingle(_canonical, b);
_off += _entryLength + 4;
}
_currentGroupEntryIndex = b.clone();
_inGroupEntry = true;
} else if (groupID == 0xc0) {
// Element of a group entry: exported only if the group start matched (_comparison == 1)
if (_type == KEYWORD || _type == CROSS) {
if (_off + 7 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
if (_comparison == 1 && _inGroupEntry) {
// Read the text position; the heading comes from the running keyword heading position
long tPage = ByteUtil.getLong4(_cache, _off+1);
int tOff = ByteUtil.getInt2(_cache, _off+5);
Result result = new Result(_sub, _keywordHeading, tPage, tOff);
_keywordHeading =
_sub.getNextHeadingPosition(_keywordHeading);
export(_currentGroupEntryIndex, result);
}
_off += 7;
} else if (_type == MULTI) {
if (_off + 13 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
if (_comparison == 1 && _inGroupEntry) {
// Read the text and heading positions
long tPage = ByteUtil.getLong4(_cache, _off+1);
int tOff = ByteUtil.getInt2(_cache, _off+5);
long hPage = ByteUtil.getLong4(_cache, _off+7);
int hOff = ByteUtil.getInt2(_cache, _off+11);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(_currentGroupEntryIndex, result);
}
_off += 13;
} else {
// Element carries its own key: ID (1), key length (1), key, then a 12-byte position record
_entryLength = _cache[_off+1] & 0xff;
if (_off + _entryLength + 14 > BookInputStream.PAGE_SIZE) {
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
byte[] b = new byte[_entryLength];
System.arraycopy(_cache, _off+2, b, 0, b.length);
_off += _entryLength + 2;
if (_comparison == 1 && _inGroupEntry
&& _compareGroup(_word, b) == 0) {
// Read the text and heading positions
long tPage = ByteUtil.getLong4(_cache, _off);
int tOff = ByteUtil.getInt2(_cache, _off+4);
long hPage = ByteUtil.getLong4(_cache, _off+6);
int hOff = ByteUtil.getInt2(_cache, _off+10);
Result result = new Result(_sub, hPage, hOff, tPage, tOff);
export(b, result);
}
_off += 12;
}
} else {
// Unknown group ID
throw new EBException(EBException.UNEXP_FILE, _file.getPath());
}
_entryIndex++;
}
}
// end of EntryEnumerator.java
package com.github.rubyu.ebquery;
public interface IExporter {
public void export(String indexValue, String heading, String description);
}
package com.github.rubyu.ebquery
import java.io.{File, PrintWriter}
import io.github.eb4j.{Book, EntryEnumerator, SingleWordEnumerator}
import io.github.eb4j.io.{BookReader, EPWINGInputStream}
import java.lang.reflect.Method
import org.specs2.mutable.Specification
/**
 * Manual smoke test: dumps every entry of a local dictionary to `SRD.dump.tsv`.
 *
 * NOTE(review): the dictionary paths are absolute paths on the author's machine, so this
 * spec only runs where that dictionary exists.
 */
class JSONExporterTest extends Specification {
  "EBLeaf" should {
    "equal with a instance be of the same string" in {
      val dir = "/Users/i.y.nishiseki/Dropbox/work/dictionary/SRD"
      val mapFilePath = "/Users/i.y.nishiseki/Dropbox/work/dictionary/SRD/SRD.map"
      val subBook = new Book(dir).getSubBook(0)
      val mapper = new ExternalCharacterMapper(mapFilePath)
      val proc = new EBProcessor
      proc.newline = new EBProcessorImpl.text.Newline
      proc.externalCharacter = new EBProcessorImpl.text.ReplacementCharacter
      proc.text = new EBProcessorImpl.text.Text
      val hook = new EBProcessorAdapter(subBook, mapper, proc)
      val exporter = new TSVExporter(new PrintWriter(new File("SRD.dump.tsv")))
      // Close the exporter even on failure; otherwise buffered rows may never reach the file.
      try {
        val enumerator = EntryEnumerator.Create(subBook, hook, exporter)
        // println (Iterator.continually(searcher.getNextResult()) takeWhile(_ != null) size)
        // 1118269
        // Iterator.continually(enumerator.getNextResult()) takeWhile(_ != null) foreach { result =>
        // val h = result.getHeading(hook)
        // val t = result.getText(hook)
        // if (h.startsWith("get")) {
        // println(s"{heading=$h, text=$t".replace("\n", "¥n"))
        // }
        // }
        enumerator.getNextResult()
      } finally {
        exporter.close()
      }
      // {heading=ab·so·lute, text=ABSOLUTE ADDRESS.¥n━n.⇑¥n【1】絶対的なもの(↔relative).¥n
      // {heading=act, text=ACT OF TOLERATION [UNIFORMITY].¥n(2)⦅しばしば A-⦆(会議の)記録,
      // {heading=ac·tor, text=a BAD ACTOR.¥n【4】〘ローマ法〙原告(plaintiff), 弁護人(advocat
      // {heading=age, text=STONE AGE¥nBRONZE AGE¥nIRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe
      // {heading=age, text=BRONZE AGE¥nIRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe age of elec
      // {heading=age, text=IRON AGE¥nDARK AGES¥nMIDDLE AGES¥nthe age of electronic commu
      true mustEqual true
    }
  }
}
package com.github.rubyu.ebquery
import java.io.Writer
import com.github.tototoshi.csv._
/**
 * [[IExporter]] that writes each entry as one tab-separated row (index value, heading,
 * description) to the given writer.
 *
 * Backslashes are doubled and line feeds are written as the two characters `\n`
 * before a field is handed to the CSV writer.
 *
 * @param w destination writer; closed by [[close]].
 */
class TSVExporter(w: Writer) extends IExporter {
  implicit val format = new TSVFormat {}
  val writer = CSVWriter.open(w)(format)

  /** Escapes backslashes first, then line feeds, so the added backslashes are not doubled again. */
  private def escape(s: String): String = {
    val backslashesDoubled = s.replaceAll(raw"\\", raw"\\\\")
    backslashesDoubled.replaceAll(raw"\n", raw"\\n")
  }

  override def export(indexValue: String, heading: String, description: String): Unit = {
    val row = List(indexValue, heading, description).map(escape)
    writer.writeRow(row)
  }

  /** Closes the underlying CSV writer. */
  def close(): Unit = writer.close()
}
@rubyu
Copy link
Author

rubyu commented Jun 27, 2019

_comparePre = 0
_compareSingle = 1
として、this._comparison == 0のif文を == 1 とするのがよさそう

_compareGroupについてはよくわからない
とりあえず = 1 で

@rubyu
Copy link
Author

rubyu commented Jun 27, 2019

_comparePre = cmp
_compareSingle = 1
で動作としては正しい?

@rubyu
Copy link
Author

rubyu commented Jun 27, 2019

先頭からしばらくの領域はそもそも指しているところがおかしいような?

@rubyu
Copy link
Author

rubyu commented Jun 27, 2019

Exporter, ExportFormatter, TSVExportFormatter として、ヤバいループ内でexport(Result r)をコールすればよさげ

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

 ~/ebquery> csvq -o SRD.distinct.sort.tsv -f TSV -without-header -no-header "SELECT DISTINCT c2, c3 FROM `SRD.dump.tsv` ORDER BY c2 ASC"

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

csvq -o SRD.distinct.sort.normalize.tsv -f TSV -without-header -no-header "SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLAC
E(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLAC
E(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLAC
E(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(c2, '·', ''), '-', ''), 'ꝣ', 'z'), 'ⱬ', 'z'), 'ɀ', 'z'), 'ȥ', 'z'), 'ƶ', 'z'), 'ẕ', 'z'), 'ẓ', 'z'), 'ž', 'z'), 'ż', 'z'), 'ẑ', 'z'), 'ź', 'z'), 'z', 'z'), 'ⓩ', 'z'), 'z', 'z'), 'ỿ', 'y'), 'ɏ', 'y'), 'ƴ', 'y'), 'ỵ', 'y'), 'ẙ', 'y'), 'ỷ', 'y'), 'ÿ', 'y'), 'ẏ', 'y'), 'ȳ', 'y'), 'ỹ', 'y'), 'ŷ', 'y'), 'ý', 'y'), 'ỳ', 'y'), 'y', 'y'), 'ⓨ', 'y'), 'y', 'y'), 'ẍ', 'x'), 'ẋ', 'x'), 'x', 'x'), 'ⓧ', 'x'), 'x', 'x'), 'ⱳ', 'w'), 'ẉ', 'w'), 'ẘ', 'w'), 'ẅ', 'w'), 'ẇ', 'w'), 'ŵ', 'w'), 'ẃ', 'w'), 'ẁ', 'w'), 'w', 'w'), 'ⓦ', 'w'), 'w', 'w'), 'ʌ', 'v'), 'ꝟ', 'v'), 'ʋ', 'v'), 'ṿ', 'v'), 'ṽ', 'v'), 'v', 'v'), 'ⓥ', 'v'), 'v', 'v'), 'ʉ', 'u'), 'ṵ', 'u'), 'ṷ', 'u'), 'ų', 'u'), 'ṳ', 'u'), 'ụ', 'u'), 'ự', 'u'), 'ử', 'u'), 'ữ', 'u'), 'ứ', 'u'), 'ừ', 'u'), 'ư', 'u'), 'ȗ', 'u'), 'ȕ', 'u'), 'ǔ', 'u'), 'ű', 'u'), 'ů', 'u'), 'ủ', 'u'), 'ǚ', 'u'), 'ǖ', 'u'), 'ǘ', 'u'), 'ǜ', 'u'), 'ü', 'u'), 'ŭ', 'u'), 'ṻ', 'u'), 'ū', 'u'), 'ṹ', 'u'), 'ũ', 'u'), 'û', 'u'), 'ú', 'u'), 'ù', 'u'), 'u', 'u'), 'ⓤ', 'u'), 'u', 'u'), 'ꞇ', 't'), 'ⱦ', 't'), 'ʈ', 't'), 'ƭ', 't'), 'ŧ', 't'), 'ṯ', 't'), 'ṱ', 't'), 'ţ', 't'), 'ț', 't'), 'ṭ', 't'), 'ť', 't'), 'ẗ', 't'), 'ṫ', 't'), 't', 't'), 'ⓣ', 't'), 't', 't'), 'ẛ', 's'), 'ꞅ', 's'), 'ꞩ', 's'), 'ȿ', 's'), 'ş', 's'), 'ș', 's'), 'ṩ', 's'), 'ṣ', 's'), 'ṧ', 
's'), 'š', 's'), 'ṡ', 's'), 'ŝ', 's'), 'ṥ', 's'), 'ś', 's'), 'ß', 's'), 's', 's'), 'ⓢ', 's'), 's', 's'), 'ꞃ', 'r'), 'ꞧ', 'r'), 'ꝛ', 'r'), 'ɽ', 'r'), 'ɍ', 'r'), 'ṟ', 'r'), 'ŗ', 'r'), 'ṝ', 'r'), 'ṛ', 'r'), 'ȓ', 'r'), 'ȑ', 'r'), 'ř', 'r'), 'ṙ', 'r'), 'ŕ', 'r'), 'r', 'r'), 'ⓡ', 'r'), 'r', 'r'), 'ꝙ', 'q'), 'ꝗ', 'q'), 'ɋ', 'q'), 'q', 'q'), 'ⓠ', 'q'), 'q', 'q'), 'ꝕ', 'p'), 'ꝓ', 'p'), 'ꝑ', 'p'), 'ᵽ', 'p'), 'ƥ', 'p'), 'ṗ', 'p'), 'ṕ', 'p'), 'p', 'p'), 'ⓟ', 'p'), 'p', 'p'), 'ɵ', 'o'), 'ꝍ', 'o'), 'ꝋ', 'o'), 'ɔ', 'o'), 'ǿ', 'o'), 'ø', 'o'), 'ǭ', 'o'), 'ǫ', 'o'), 'ộ', 'o'), 'ọ', 'o'), 'ợ', 'o'), 'ở', 'o'), 'ỡ', 'o'), 'ớ', 'o'), 'ờ', 'o'), 'ơ', 'o'), 'ȏ', 'o'), 'ȍ', 'o'), 'ǒ', 'o'), 'ő', 'o'), 'ỏ', 'o'), 'ȫ', 'o'), 'ö', 'o'), 'ȱ', 'o'), 'ȯ', 'o'), 'ŏ', 'o'), 'ṓ', 'o'), 'ṑ', 'o'), 'ō', 'o'), 'ṏ', 'o'), 'ȭ', 'o'), 'ṍ', 'o'), 'õ', 'o'), 'ổ', 'o'), 'ỗ', 'o'), 'ố', 'o'), 'ồ', 'o'), 'ô', 'o'), 'ó', 'o'), 'ò', 'o'), 'o', 'o'), 'ⓞ', 'o'), 'o', 'o'), 'ꞥ', 'n'), 'ꞑ', 'n'), 'ʼn', 'n'), 'ɲ', 'n'), 'ƞ', 'n'), 'ṉ', 'n'), 'ṋ', 'n'), 'ņ', 'n'), 'ṇ', 'n'), 'ň', 'n'), 'ṅ', 'n'), 'ñ', 'n'), 'ń', 'n'), 'ǹ', 'n'), 'n', 'n'), 'ⓝ', 'n'), 'n', 'n'), 'ɯ', 'm'), 'ɱ', 'm'), 'ṃ', 'm'), 'ṁ', 'm'), 'ḿ', 'm'), 'm', 'm'), 'ⓜ', 'm'), 'm', 'm'), 'ꝇ', 'l'), 'ꞁ', 'l'), 'ꝉ', 'l'), 'ⱡ', 'l'), 'ɫ', 'l'), 'ƚ', 'l'), 'ł', 'l'), 'ſ', 'l'), 'ḻ', 'l'), 'ḽ', 'l'), 'ļ', 'l'), 'ḹ', 'l'), 'ḷ', 'l'), 'ľ', 'l'), 'ĺ', 'l'), 'ŀ', 'l'), 'l', 'l'), 'ⓛ', 'l'), 'l', 'l'), 'ꞣ', 'k'), 'ꝅ', 'k'), 'ꝃ', 'k'), 'ꝁ', 'k'), 'ⱪ', 'k'), 'ƙ', 'k'), 'ḵ', 'k'), 'ķ', 'k'), 'ḳ', 'k'), 'ǩ', 'k'), 'ḱ', 'k'), 'k', 'k'), 'ⓚ', 'k'), 'k', 'k'), 'ɉ', 'j'), 'ǰ', 'j'), 'ĵ', 'j'), 'j', 'j'), 'ⓙ', 'j'), 'j', 'j'), 'ı', 'i'), 'ɨ', 'i'), 'ḭ', 'i'), 'į', 'i'), 'ị', 'i'), 'ȋ', 'i'), 'ȉ', 'i'), 'ǐ', 'i'), 'ỉ', 'i'), 'ḯ', 'i'), 'ï', 'i'), 'ĭ', 'i'), 'ī', 'i'), 'ĩ', 'i'), 'î', 'i'), 'í', 'i'), 'ì', 'i'), 'i', 'i'), 'ⓘ', 'i'), 'i', 'i'), 'ɥ', 'h'), 'ⱶ', 'h'), 'ⱨ', 'h'), 'ħ', 'h'), 'ẖ', 'h'), 'ḫ', 'h'), 'ḩ', 'h'), 'ḥ', 'h'), 'ȟ', 'h'), 'ḧ', 'h'), 'ḣ', 'h'), 'ĥ', 'h'), 
'h', 'h'), 'ⓗ', 'h'), 'h', 'h'), 'ꝿ', 'g'), 'ᵹ', 'g'), 'ꞡ', 'g'), 'ɠ', 'g'), 'ǥ', 'g'), 'ģ', 'g'), 'ǧ', 'g'), 'ġ', 'g'), 'ğ', 'g'), 'ḡ', 'g'), 'ĝ', 'g'), 'ǵ', 'g'), 'g', 'g'), 'ⓖ', 'g'), 'g', 'g'), 'ꝼ', 'f'), 'ƒ', 'f'), 'ḟ', 'f'), 'f', 'f'), 'ⓕ', 'f'), 'f', 'f'), 'ǝ', 'e'), 'ɛ', 'e'), 'ɇ', 'e'), 'ḛ', 'e'), 'ḙ', 'e'), 'ę', 'e'), 'ḝ', 'e'), 'ȩ', 'e'), 'ệ', 'e'), 'ẹ', 'e'), 'ȇ', 'e'), 'ȅ', 'e'), 'ě', 'e'), 'ẻ', 'e'), 'ë', 'e'), 'ė', 'e'), 'ĕ', 'e'), 'ḗ', 'e'), 'ḕ', 'e'), 'ē', 'e'), 'ẽ', 'e'), 'ể', 'e'), 'ễ', 'e'), 'ế', 'e'), 'ề', 'e'), 'ê', 'e'), 'é', 'e'), 'è', 'e'), 'e', 'e'), 'ⓔ', 'e'), 'e', 'e'), 'ꝺ', 'd'), 'ɗ', 'd'), 'ɖ', 'd'), 'ƌ', 'd'), 'đ', 'd'), 'ḏ', 'd'), 'ḓ', 'd'), 'ḑ', 'd'), 'ḍ', 'd'), 'ď', 'd'), 'ḋ', 'd'), 'd', 'd'), 'ⓓ', 'd'), 'd', 'd'), 'ↄ', 'c'), 'ꜿ', 'c'), 'ȼ', 'c'), 'ƈ', 'c'), 'ḉ', 'c'), 'ç', 'c'), 'č', 'c'), 'ċ', 'c'), 'ĉ', 'c'), 'ć', 'c'), 'c', 'c'), 'ⓒ', 'c'), 'c', 'c'), 'ɓ', 'b'), 'ƃ', 'b'), 'ƀ', 'b'), 'ḇ', 'b'), 'ḅ', 'b'), 'ḃ', 'b'), 'b', 'b'), 'ⓑ', 'b'), 'b', 'b'), 'ɐ', 'a'), 'ⱥ', 'a'), 'ą', 'a'), 'ḁ', 'a'), 'ặ', 'a'), 'ậ', 'a'), 'ạ', 'a'), 'ȃ', 'a'), 'ȁ', 'a'), 'ǎ', 'a'), 'ǻ', 'a'), 'å', 'a'), 'ả', 'a'), 'ǟ', 'a'), 'ä', 'a'), 'ǡ', 'a'), 'ȧ', 'a'), 'ẳ', 'a'), 'ẵ', 'a'), 'ắ', 'a'), 'ằ', 'a'), 'ă', 'a'), 'ā', 'a'), 'ã', 'a'), 'ẩ', 'a'), 'ẫ', 'a'), 'ấ', 'a'), 'ầ', 'a'), 'â', 'a'), 'á', 'a'), 'à', 'a'), 'ẚ', 'a'), 'a', 'a'), 'ⓐ', 'a'), 'a', 'a'), 'Ꝣ', 'Z'), 'Ⱬ', 'Z'), 'Ɀ', 'Z'), 'Ȥ', 'Z'), 'Ƶ', 'Z'), 'Ẕ', 'Z'), 'Ẓ', 'Z'), 'Ž', 'Z'), 'Ż', 'Z'), 'Ẑ', 'Z'), 'Ź', 'Z'), 'Z', 'Z'), 'Ⓩ', 'Z'), 'Z', 'Z'), 'Ỿ', 'Y'), 'Ɏ', 'Y'), 'Ƴ', 'Y'), 'Ỵ', 'Y'), 'Ỷ', 'Y'), 'Ÿ', 'Y'), 'Ẏ', 'Y'), 'Ȳ', 'Y'), 'Ỹ', 'Y'), 'Ŷ', 'Y'), 'Ý', 'Y'), 'Ỳ', 'Y'), 'Y', 'Y'), 'Ⓨ', 'Y'), 'Y', 'Y'), 'Ẍ', 'X'), 'Ẋ', 'X'), 'X', 'X'), 'Ⓧ', 'X'), 'X', 'X'), 'Ⱳ', 'W'), 'Ẉ', 'W'), 'Ẅ', 'W'), 'Ẇ', 'W'), 'Ŵ', 'W'), 'Ẃ', 'W'), 'Ẁ', 'W'), 'W', 'W'), 'Ⓦ', 'W'), 'W', 'W'), 'Ʌ', 'V'), 'Ꝟ', 'V'), 'Ʋ', 'V'), 'Ṿ', 'V'), 'Ṽ', 'V'), 'V', 'V'), 'Ⓥ', 'V'), 'V', 'V'), 'Ʉ', 'U'), 'Ṵ', 'U'), 'Ṷ', 'U'), 'Ų', 
'U'), 'Ṳ', 'U'), 'Ụ', 'U'), 'Ự', 'U'), 'Ử', 'U'), 'Ữ', 'U'), 'Ứ', 'U'), 'Ừ', 'U'), 'Ư', 'U'), 'Ȗ', 'U'), 'Ȕ', 'U'), 'Ǔ', 'U'), 'Ű', 'U'), 'Ů', 'U'), 'Ủ', 'U'), 'Ǚ', 'U'), 'Ǖ', 'U'), 'Ǘ', 'U'), 'Ǜ', 'U'), 'Ü', 'U'), 'Ŭ', 'U'), 'Ṻ', 'U'), 'Ū', 'U'), 'Ṹ', 'U'), 'Ũ', 'U'), 'Û', 'U'), 'Ú', 'U'), 'Ù', 'U'), 'U', 'U'), 'Ⓤ', 'U'), 'U', 'U'), 'Ꞇ', 'T'), 'Ⱦ', 'T'), 'Ʈ', 'T'), 'Ƭ', 'T'), 'Ŧ', 'T'), 'Ṯ', 'T'), 'Ṱ', 'T'), 'Ţ', 'T'), 'Ț', 'T'), 'Ṭ', 'T'), 'Ť', 'T'), 'Ṫ', 'T'), 'T', 'T'), 'Ⓣ', 'T'), 'T', 'T'), 'Ꞅ', 'S'), 'Ꞩ', 'S'), 'Ȿ', 'S'), 'Ş', 'S'), 'Ș', 'S'), 'Ṩ', 'S'), 'Ṣ', 'S'), 'Ṧ', 'S'), 'Š', 'S'), 'Ṡ', 'S'), 'Ŝ', 'S'), 'Ṥ', 'S'), 'Ś', 'S'), 'ẞ', 'S'), 'S', 'S'), 'Ⓢ', 'S'), 'S', 'S'), 'Ꞃ', 'R'), 'Ꞧ', 'R'), 'Ꝛ', 'R'), 'Ɽ', 'R'), 'Ɍ', 'R'), 'Ṟ', 'R'), 'Ŗ', 'R'), 'Ṝ', 'R'), 'Ṛ', 'R'), 'Ȓ', 'R'), 'Ȑ', 'R'), 'Ř', 'R'), 'Ṙ', 'R'), 'Ŕ', 'R'), 'R', 'R'), 'Ⓡ', 'R'), 'R', 'R'), 'Ɋ', 'Q'), 'Ꝙ', 'Q'), 'Ꝗ', 'Q'), 'Q', 'Q'), 'Ⓠ', 'Q'), 'Q', 'Q'), 'Ꝕ', 'P'), 'Ꝓ', 'P'), 'Ꝑ', 'P'), 'Ᵽ', 'P'), 'Ƥ', 'P'), 'Ṗ', 'P'), 'Ṕ', 'P'), 'P', 'P'), 'Ⓟ', 'P'), 'P', 'P'), 'Ꝍ', 'O'), 'Ꝋ', 'O'), 'Ɵ', 'O'), 'Ɔ', 'O'), 'Ǿ', 'O'), 'Ø', 'O'), 'Ǭ', 'O'), 'Ǫ', 'O'), 'Ộ', 'O'), 'Ọ', 'O'), 'Ợ', 'O'), 'Ở', 'O'), 'Ỡ', 'O'), 'Ớ', 'O'), 'Ờ', 'O'), 'Ơ', 'O'), 'Ȏ', 'O'), 'Ȍ', 'O'), 'Ǒ', 'O'), 'Ő', 'O'), 'Ỏ', 'O'), 'Ȫ', 'O'), 'Ö', 'O'), 'Ȱ', 'O'), 'Ȯ', 'O'), 'Ŏ', 'O'), 'Ṓ', 'O'), 'Ṑ', 'O'), 'Ō', 'O'), 'Ṏ', 'O'), 'Ȭ', 'O'), 'Ṍ', 'O'), 'Õ', 'O'), 'Ổ', 'O'), 'Ỗ', 'O'), 'Ố', 'O'), 'Ồ', 'O'), 'Ô', 'O'), 'Ó', 'O'), 'Ò', 'O'), 'O', 'O'), 'Ⓞ', 'O'), 'O', 'O'), 'Ꞥ', 'N'), 'Ꞑ', 'N'), 'Ɲ', 'N'), 'Ƞ', 'N'), 'Ṉ', 'N'), 'Ṋ', 'N'), 'Ņ', 'N'), 'Ṇ', 'N'), 'Ň', 'N'), 'Ṅ', 'N'), 'Ñ', 'N'), 'Ń', 'N'), 'Ǹ', 'N'), 'N', 'N'), 'Ⓝ', 'N'), 'N', 'N'), 'Ɯ', 'M'), 'Ɱ', 'M'), 'Ṃ', 'M'), 'Ṁ', 'M'), 'Ḿ', 'M'), 'M', 'M'), 'Ⓜ', 'M'), 'M', 'M'), 'Ꞁ', 'L'), 'Ꝇ', 'L'), 'Ꝉ', 'L'), 'Ⱡ', 'L'), 'Ɫ', 'L'), 'Ƚ', 'L'), 'Ł', 'L'), 'Ḻ', 'L'), 'Ḽ', 'L'), 'Ļ', 'L'), 'Ḹ', 'L'), 'Ḷ', 'L'), 'Ľ', 'L'), 'Ĺ', 'L'), 'Ŀ', 'L'), 'L', 'L'), 'Ⓛ', 'L'), 'L', 'L'), 'Ꞣ', 'K'), 
'Ꝅ', 'K'), 'Ꝃ', 'K'), 'Ꝁ', 'K'), 'Ⱪ', 'K'), 'Ƙ', 'K'), 'Ḵ', 'K'), 'Ķ', 'K'), 'Ḳ', 'K'), 'Ǩ', 'K'), 'Ḱ', 'K'), 'K', 'K'), 'Ⓚ', 'K'), 'K', 'K'), 'Ɉ', 'J'), 'Ĵ', 'J'), 'J', 'J'), 'Ⓙ', 'J'), 'J', 'J'), 'Ɨ', 'I'), 'Ḭ', 'I'), 'Į', 'I'), 'Ị', 'I'), 'Ȋ', 'I'), 'Ȉ', 'I'), 'Ǐ', 'I'), 'Ỉ', 'I'), 'Ḯ', 'I'), 'Ï', 'I'), 'İ', 'I'), 'Ĭ', 'I'), 'Ī', 'I'), 'Ĩ', 'I'), 'Î', 'I'), 'Í', 'I'), 'Ì', 'I'), 'I', 'I'), 'Ⓘ', 'I'), 'I', 'I'), 'Ɥ', 'H'), 'Ⱶ', 'H'), 'Ⱨ', 'H'), 'Ħ', 'H'), 'Ḫ', 'H'), 'Ḩ', 'H'), 'Ḥ', 'H'), 'Ȟ', 'H'), 'Ḧ', 'H'), 'Ḣ', 'H'), 'Ĥ', 'H'), 'H', 'H'), 'Ⓗ', 'H'), 'H', 'H'), 'Ꝿ', 'G'), 'Ᵹ', 'G'), 'Ꞡ', 'G'), 'Ɠ', 'G'), 'Ǥ', 'G'), 'Ģ', 'G'), 'Ǧ', 'G'), 'Ġ', 'G'), 'Ğ', 'G'), 'Ḡ', 'G'), 'Ĝ', 'G'), 'Ǵ', 'G'), 'G', 'G'), 'Ⓖ', 'G'), 'G', 'G'), 'Ꝼ', 'F'), 'Ƒ', 'F'), 'Ḟ', 'F'), 'F', 'F'), 'Ⓕ', 'F'), 'F', 'F'), 'Ǝ', 'E'), 'Ɛ', 'E'), 'Ḛ', 'E'), 'Ḙ', 'E'), 'Ę', 'E'), 'Ḝ', 'E'), 'Ȩ', 'E'), 'Ệ', 'E'), 'Ẹ', 'E'), 'Ȇ', 'E'), 'Ȅ', 'E'), 'Ě', 'E'), 'Ẻ', 'E'), 'Ë', 'E'), 'Ė', 'E'), 'Ĕ', 'E'), 'Ḗ', 'E'), 'Ḕ', 'E'), 'Ē', 'E'), 'Ẽ', 'E'), 'Ể', 'E'), 'Ễ', 'E'), 'Ế', 'E'), 'Ề', 'E'), 'Ê', 'E'), 'É', 'E'), 'È', 'E'), 'E', 'E'), 'Ⓔ', 'E'), 'E', 'E'), 'Ð', 'D'), 'Ꝺ', 'D'), 'Ɖ', 'D'), 'Ɗ', 'D'), 'Ƌ', 'D'), 'Đ', 'D'), 'Ḏ', 'D'), 'Ḓ', 'D'), 'Ḑ', 'D'), 'Ḍ', 'D'), 'Ď', 'D'), 'Ḋ', 'D'), 'D', 'D'), 'Ⓓ', 'D'), 'D', 'D'), 'Ꜿ', 'C'), 'Ȼ', 'C'), 'Ƈ', 'C'), 'Ḉ', 'C'), 'Ç', 'C'), 'Č', 'C'), 'Ċ', 'C'), 'Ĉ', 'C'), 'Ć', 'C'), 'C', 'C'), 'Ⓒ', 'C'), 'C', 'C'), 'Ɓ', 'B'), 'Ƃ', 'B'), 'Ƀ', 'B'), 'Ḇ', 'B'), 'Ḅ', 'B'), 'Ḃ', 'B'), 'B', 'B'), 'Ⓑ', 'B'), 'B', 'B'), 'Ɐ', 'A'), 'Ⱥ', 'A'), 'Ą', 'A'), 'Ḁ', 'A'), 'Ặ', 'A'), 'Ậ', 'A'), 'Ạ', 'A'), 'Ȃ', 'A'), 'Ȁ', 'A'), 'Ǎ', 'A'), 'Ǻ', 'A'), 'Å', 'A'), 'Ả', 'A'), 'Ǟ', 'A'), 'Ä', 'A'), 'Ǡ', 'A'), 'Ȧ', 'A'), 'Ẳ', 'A'), 'Ẵ', 'A'), 'Ắ', 'A'), 'Ằ', 'A'), 'Ă', 'A'), 'Ā', 'A'), 'Ã', 'A'), 'Ẩ', 'A'), 'Ẫ', 'A'), 'Ấ', 'A'), 'Ầ', 'A'), 'Â', 'A'), 'Á', 'A'), 'À', 'A'), 'A', 'A'), 'Ⓐ', 'A'), 'A', 'A') AS normalized_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.dump.tsv) ORDER BY normalized_c2 ASC"

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

https://wandbox.org/permlink/usudtAkxvLDRXGXN

def c(vs, o):
  """Pair every element of `vs` with the same value `o`.

  Returns a list of (element, o) tuples in iteration order. Iterating a
  string yields its characters, so c("ab", "x") gives
  [("a", "x"), ("b", "x")].
  """
  pairs = []
  for v in vs:
    pairs.append((v, o))
  return pairs

def to_sql_replace(xs, target="target_column"):
  """Build a nested SQL REPLACE(...) expression from replacement pairs.

  xs: iterable of items whose [0] is the text to search for and whose [1]
    is the text to substitute.
  target: SQL expression placed at the innermost position. It is inserted
    verbatim (it is an expression such as a column name, not a literal).
    Defaults to the placeholder "target_column".

  Returns the expression as a string. The first item of `xs` becomes the
  outermost REPLACE call and the last item the innermost one. An empty
  `xs` returns `target` unchanged.
  """
  # Materialize once: we need both the length and reverse iteration, which
  # a one-shot iterable (generator) cannot provide.
  pairs = list(xs)

  def quote(value):
    # Double embedded single quotes so the value stays one valid SQL
    # string literal instead of terminating it early.
    return "'" + str(value).replace("'", "''") + "'"

  # One opening "REPLACE(" per pair, then the innermost operand.
  buf = ["REPLACE(" * len(pairs), target]
  # Closing argument lists are emitted innermost-first, hence the reversal.
  for pair in reversed(pairs):
    buf.append(", {}, {})".format(quote(pair[0]), quote(pair[1])))
  return "".join(buf)

# https://stackoverflow.com/questions/990904/remove-accents-diacritics-in-a-string-in-javascript

# Character-folding table, flattened into (search, replacement) pairs.
# Each entry lists the source characters (as \u escapes) followed by the
# single base letter they are replaced with. The last two entries map "-"
# and "·" to the empty string, i.e. those characters are removed.
ps = [
  pair
  for letters, base in (
    ("\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F", "A"),
    ("\u0042\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0182\u0181", "B"),
    ("\u0043\u24B8\uFF23\u0106\u0108\u010A\u010C\u00C7\u1E08\u0187\u023B\uA73E", "C"),
    ("\u0044\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018B\u018A\u0189\uA779\u00D0", "D"),
    ("\u0045\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E", "E"),
    ("\u0046\u24BB\uFF26\u1E1E\u0191\uA77B", "F"),
    ("\u0047\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E", "G"),
    ("\u0048\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D", "H"),
    ("\u0049\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197", "I"),
    ("\u004A\u24BF\uFF2A\u0134\u0248", "J"),
    ("\u004B\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2", "K"),
    ("\u004C\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780", "L"),
    ("\u004D\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C", "M"),
    ("\u004E\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u0220\u019D\uA790\uA7A4", "N"),
    ("\u004F\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C", "O"),
    ("\u0050\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754", "P"),
    ("\u0051\u24C6\uFF31\uA756\uA758\u024A", "Q"),
    ("\u0052\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782", "R"),
    ("\u0053\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784", "S"),
    ("\u0054\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786", "T"),
    ("\u0055\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244", "U"),
    ("\u0056\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245", "V"),
    ("\u0057\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72", "W"),
    ("\u0058\u24CD\uFF38\u1E8A\u1E8C", "X"),
    ("\u0059\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE", "Y"),
    ("\u005A\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762", "Z"),
    ("\u0061\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250", "a"),
    ("\u0062\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253", "b"),
    ("\u0063\u24D2\uFF43\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184", "c"),
    ("\u0064\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\uA77A", "d"),
    ("\u0065\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u025B\u01DD", "e"),
    ("\u0066\u24D5\uFF46\u1E1F\u0192\uA77C", "f"),
    ("\u0067\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\u1D79\uA77F", "g"),
    ("\u0068\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265", "h"),
    ("\u0069\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131", "i"),
    ("\u006A\u24D9\uFF4A\u0135\u01F0\u0249", "j"),
    ("\u006B\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3", "k"),
    ("\u006C\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747", "l"),
    ("\u006D\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F", "m"),
    ("\u006E\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5", "n"),
    ("\u006F\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\u0254\uA74B\uA74D\u0275", "o"),
    ("\u0070\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755", "p"),
    ("\u0071\u24E0\uFF51\u024B\uA757\uA759", "q"),
    ("\u0072\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783", "r"),
    ("\u0073\u24E2\uFF53\u00DF\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B", "s"),
    ("\u0074\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787", "t"),
    ("\u0075\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289", "u"),
    ("\u0076\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C", "v"),
    ("\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73", "w"),
    ("\u0078\u24E7\uFF58\u1E8B\u1E8D", "x"),
    ("\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF", "y"),
    ("\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763", "z"),
    ("-", ""),
    ("·", ""),
  )
  for pair in c(letters, base)
]

# Emit the nested REPLACE(...) expression built from the whole table.
print(to_sql_replace(ps))

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(target_column, '·', ''), '-', ''), 'ꝣ', 'z'), 'ⱬ', 'z'), 'ɀ', 'z'), 'ȥ', 'z'), 'ƶ', 'z'), 'ẕ', 'z'), 'ẓ', 'z'), 'ž', 'z'), 'ż', 'z'), 'ẑ', 'z'), 'ź', 'z'), 'z', 'z'), 'ⓩ', 'z'), 'z', 'z'), 'ỿ', 'y'), 'ɏ', 'y'), 'ƴ', 'y'), 'ỵ', 'y'), 'ẙ', 'y'), 'ỷ', 'y'), 'ÿ', 'y'), 'ẏ', 'y'), 'ȳ', 'y'), 'ỹ', 'y'), 'ŷ', 'y'), 'ý', 'y'), 'ỳ', 'y'), 'y', 'y'), 'ⓨ', 'y'), 'y', 'y'), 'ẍ', 'x'), 'ẋ', 'x'), 'x', 'x'), 'ⓧ', 'x'), 'x', 'x'), 'ⱳ', 'w'), 'ẉ', 'w'), 'ẘ', 'w'), 'ẅ', 'w'), 'ẇ', 'w'), 'ŵ', 'w'), 'ẃ', 'w'), 'ẁ', 'w'), 'w', 'w'), 'ⓦ', 'w'), 'w', 'w'), 'ʌ', 'v'), 'ꝟ', 'v'), 'ʋ', 'v'), 'ṿ', 'v'), 'ṽ', 'v'), 'v', 'v'), 'ⓥ', 'v'), 'v', 'v'), 'ʉ', 'u'), 'ṵ', 'u'), 'ṷ', 'u'), 'ų', 'u'), 'ṳ', 'u'), 'ụ', 'u'), 'ự', 'u'), 'ử', 'u'), 'ữ', 'u'), 'ứ', 'u'), 'ừ', 'u'), 'ư', 'u'), 'ȗ', 'u'), 'ȕ', 'u'), 'ǔ', 'u'), 'ű', 'u'), 'ů', 'u'), 'ủ', 'u'), 'ǚ', 'u'), 'ǖ', 'u'), 'ǘ', 'u'), 'ǜ', 'u'), 'ü', 'u'), 'ŭ', 'u'), 'ṻ', 'u'), 'ū', 'u'), 'ṹ', 'u'), 'ũ', 'u'), 'û', 'u'), 'ú', 'u'), 'ù', 'u'), 'u', 'u'), 'ⓤ', 'u'), 'u', 'u'), 'ꞇ', 't'), 'ⱦ', 't'), 'ʈ', 't'), 'ƭ', 't'), 'ŧ', 't'), 'ṯ', 't'), 'ṱ', 't'), 'ţ', 't'), 'ț', 't'), 'ṭ', 't'), 'ť', 't'), 'ẗ', 't'), 'ṫ', 't'), 't', 't'), 'ⓣ', 't'), 't', 't'), 'ẛ', 's'), 'ꞅ', 's'), 'ꞩ', 's'), 'ȿ', 's'), 'ş', 's'), 'ș', 's'), 'ṩ', 's'), 'ṣ', 's'), 'ṧ', 's'), 'š', 's'), 'ṡ', 's'), 'ŝ', 's'), 'ṥ', 's'), 'ś', 's'), 'ß', 's'), 
's', 's'), 'ⓢ', 's'), 's', 's'), 'ꞃ', 'r'), 'ꞧ', 'r'), 'ꝛ', 'r'), 'ɽ', 'r'), 'ɍ', 'r'), 'ṟ', 'r'), 'ŗ', 'r'), 'ṝ', 'r'), 'ṛ', 'r'), 'ȓ', 'r'), 'ȑ', 'r'), 'ř', 'r'), 'ṙ', 'r'), 'ŕ', 'r'), 'r', 'r'), 'ⓡ', 'r'), 'r', 'r'), 'ꝙ', 'q'), 'ꝗ', 'q'), 'ɋ', 'q'), 'q', 'q'), 'ⓠ', 'q'), 'q', 'q'), 'ꝕ', 'p'), 'ꝓ', 'p'), 'ꝑ', 'p'), 'ᵽ', 'p'), 'ƥ', 'p'), 'ṗ', 'p'), 'ṕ', 'p'), 'p', 'p'), 'ⓟ', 'p'), 'p', 'p'), 'ɵ', 'o'), 'ꝍ', 'o'), 'ꝋ', 'o'), 'ɔ', 'o'), 'ǿ', 'o'), 'ø', 'o'), 'ǭ', 'o'), 'ǫ', 'o'), 'ộ', 'o'), 'ọ', 'o'), 'ợ', 'o'), 'ở', 'o'), 'ỡ', 'o'), 'ớ', 'o'), 'ờ', 'o'), 'ơ', 'o'), 'ȏ', 'o'), 'ȍ', 'o'), 'ǒ', 'o'), 'ő', 'o'), 'ỏ', 'o'), 'ȫ', 'o'), 'ö', 'o'), 'ȱ', 'o'), 'ȯ', 'o'), 'ŏ', 'o'), 'ṓ', 'o'), 'ṑ', 'o'), 'ō', 'o'), 'ṏ', 'o'), 'ȭ', 'o'), 'ṍ', 'o'), 'õ', 'o'), 'ổ', 'o'), 'ỗ', 'o'), 'ố', 'o'), 'ồ', 'o'), 'ô', 'o'), 'ó', 'o'), 'ò', 'o'), 'o', 'o'), 'ⓞ', 'o'), 'o', 'o'), 'ꞥ', 'n'), 'ꞑ', 'n'), 'ʼn', 'n'), 'ɲ', 'n'), 'ƞ', 'n'), 'ṉ', 'n'), 'ṋ', 'n'), 'ņ', 'n'), 'ṇ', 'n'), 'ň', 'n'), 'ṅ', 'n'), 'ñ', 'n'), 'ń', 'n'), 'ǹ', 'n'), 'n', 'n'), 'ⓝ', 'n'), 'n', 'n'), 'ɯ', 'm'), 'ɱ', 'm'), 'ṃ', 'm'), 'ṁ', 'm'), 'ḿ', 'm'), 'm', 'm'), 'ⓜ', 'm'), 'm', 'm'), 'ꝇ', 'l'), 'ꞁ', 'l'), 'ꝉ', 'l'), 'ⱡ', 'l'), 'ɫ', 'l'), 'ƚ', 'l'), 'ł', 'l'), 'ſ', 'l'), 'ḻ', 'l'), 'ḽ', 'l'), 'ļ', 'l'), 'ḹ', 'l'), 'ḷ', 'l'), 'ľ', 'l'), 'ĺ', 'l'), 'ŀ', 'l'), 'l', 'l'), 'ⓛ', 'l'), 'l', 'l'), 'ꞣ', 'k'), 'ꝅ', 'k'), 'ꝃ', 'k'), 'ꝁ', 'k'), 'ⱪ', 'k'), 'ƙ', 'k'), 'ḵ', 'k'), 'ķ', 'k'), 'ḳ', 'k'), 'ǩ', 'k'), 'ḱ', 'k'), 'k', 'k'), 'ⓚ', 'k'), 'k', 'k'), 'ɉ', 'j'), 'ǰ', 'j'), 'ĵ', 'j'), 'j', 'j'), 'ⓙ', 'j'), 'j', 'j'), 'ı', 'i'), 'ɨ', 'i'), 'ḭ', 'i'), 'į', 'i'), 'ị', 'i'), 'ȋ', 'i'), 'ȉ', 'i'), 'ǐ', 'i'), 'ỉ', 'i'), 'ḯ', 'i'), 'ï', 'i'), 'ĭ', 'i'), 'ī', 'i'), 'ĩ', 'i'), 'î', 'i'), 'í', 'i'), 'ì', 'i'), 'i', 'i'), 'ⓘ', 'i'), 'i', 'i'), 'ɥ', 'h'), 'ⱶ', 'h'), 'ⱨ', 'h'), 'ħ', 'h'), 'ẖ', 'h'), 'ḫ', 'h'), 'ḩ', 'h'), 'ḥ', 'h'), 'ȟ', 'h'), 'ḧ', 'h'), 'ḣ', 'h'), 'ĥ', 'h'), 'h', 'h'), 'ⓗ', 'h'), 'h', 'h'), 'ꝿ', 'g'), 'ᵹ', 'g'), 'ꞡ', 'g'), 'ɠ', 
'g'), 'ǥ', 'g'), 'ģ', 'g'), 'ǧ', 'g'), 'ġ', 'g'), 'ğ', 'g'), 'ḡ', 'g'), 'ĝ', 'g'), 'ǵ', 'g'), 'g', 'g'), 'ⓖ', 'g'), 'g', 'g'), 'ꝼ', 'f'), 'ƒ', 'f'), 'ḟ', 'f'), 'f', 'f'), 'ⓕ', 'f'), 'f', 'f'), 'ǝ', 'e'), 'ɛ', 'e'), 'ɇ', 'e'), 'ḛ', 'e'), 'ḙ', 'e'), 'ę', 'e'), 'ḝ', 'e'), 'ȩ', 'e'), 'ệ', 'e'), 'ẹ', 'e'), 'ȇ', 'e'), 'ȅ', 'e'), 'ě', 'e'), 'ẻ', 'e'), 'ë', 'e'), 'ė', 'e'), 'ĕ', 'e'), 'ḗ', 'e'), 'ḕ', 'e'), 'ē', 'e'), 'ẽ', 'e'), 'ể', 'e'), 'ễ', 'e'), 'ế', 'e'), 'ề', 'e'), 'ê', 'e'), 'é', 'e'), 'è', 'e'), 'e', 'e'), 'ⓔ', 'e'), 'e', 'e'), 'ꝺ', 'd'), 'ɗ', 'd'), 'ɖ', 'd'), 'ƌ', 'd'), 'đ', 'd'), 'ḏ', 'd'), 'ḓ', 'd'), 'ḑ', 'd'), 'ḍ', 'd'), 'ď', 'd'), 'ḋ', 'd'), 'd', 'd'), 'ⓓ', 'd'), 'd', 'd'), 'ↄ', 'c'), 'ꜿ', 'c'), 'ȼ', 'c'), 'ƈ', 'c'), 'ḉ', 'c'), 'ç', 'c'), 'č', 'c'), 'ċ', 'c'), 'ĉ', 'c'), 'ć', 'c'), 'c', 'c'), 'ⓒ', 'c'), 'c', 'c'), 'ɓ', 'b'), 'ƃ', 'b'), 'ƀ', 'b'), 'ḇ', 'b'), 'ḅ', 'b'), 'ḃ', 'b'), 'b', 'b'), 'ⓑ', 'b'), 'b', 'b'), 'ɐ', 'a'), 'ⱥ', 'a'), 'ą', 'a'), 'ḁ', 'a'), 'ặ', 'a'), 'ậ', 'a'), 'ạ', 'a'), 'ȃ', 'a'), 'ȁ', 'a'), 'ǎ', 'a'), 'ǻ', 'a'), 'å', 'a'), 'ả', 'a'), 'ǟ', 'a'), 'ä', 'a'), 'ǡ', 'a'), 'ȧ', 'a'), 'ẳ', 'a'), 'ẵ', 'a'), 'ắ', 'a'), 'ằ', 'a'), 'ă', 'a'), 'ā', 'a'), 'ã', 'a'), 'ẩ', 'a'), 'ẫ', 'a'), 'ấ', 'a'), 'ầ', 'a'), 'â', 'a'), 'á', 'a'), 'à', 'a'), 'ẚ', 'a'), 'a', 'a'), 'ⓐ', 'a'), 'a', 'a'), 'Ꝣ', 'Z'), 'Ⱬ', 'Z'), 'Ɀ', 'Z'), 'Ȥ', 'Z'), 'Ƶ', 'Z'), 'Ẕ', 'Z'), 'Ẓ', 'Z'), 'Ž', 'Z'), 'Ż', 'Z'), 'Ẑ', 'Z'), 'Ź', 'Z'), 'Z', 'Z'), 'Ⓩ', 'Z'), 'Z', 'Z'), 'Ỿ', 'Y'), 'Ɏ', 'Y'), 'Ƴ', 'Y'), 'Ỵ', 'Y'), 'Ỷ', 'Y'), 'Ÿ', 'Y'), 'Ẏ', 'Y'), 'Ȳ', 'Y'), 'Ỹ', 'Y'), 'Ŷ', 'Y'), 'Ý', 'Y'), 'Ỳ', 'Y'), 'Y', 'Y'), 'Ⓨ', 'Y'), 'Y', 'Y'), 'Ẍ', 'X'), 'Ẋ', 'X'), 'X', 'X'), 'Ⓧ', 'X'), 'X', 'X'), 'Ⱳ', 'W'), 'Ẉ', 'W'), 'Ẅ', 'W'), 'Ẇ', 'W'), 'Ŵ', 'W'), 'Ẃ', 'W'), 'Ẁ', 'W'), 'W', 'W'), 'Ⓦ', 'W'), 'W', 'W'), 'Ʌ', 'V'), 'Ꝟ', 'V'), 'Ʋ', 'V'), 'Ṿ', 'V'), 'Ṽ', 'V'), 'V', 'V'), 'Ⓥ', 'V'), 'V', 'V'), 'Ʉ', 'U'), 'Ṵ', 'U'), 'Ṷ', 'U'), 'Ų', 'U'), 'Ṳ', 'U'), 'Ụ', 'U'), 'Ự', 'U'), 'Ử', 'U'), 'Ữ', 'U'), 'Ứ', 'U'), 
'Ừ', 'U'), 'Ư', 'U'), 'Ȗ', 'U'), 'Ȕ', 'U'), 'Ǔ', 'U'), 'Ű', 'U'), 'Ů', 'U'), 'Ủ', 'U'), 'Ǚ', 'U'), 'Ǖ', 'U'), 'Ǘ', 'U'), 'Ǜ', 'U'), 'Ü', 'U'), 'Ŭ', 'U'), 'Ṻ', 'U'), 'Ū', 'U'), 'Ṹ', 'U'), 'Ũ', 'U'), 'Û', 'U'), 'Ú', 'U'), 'Ù', 'U'), 'U', 'U'), 'Ⓤ', 'U'), 'U', 'U'), 'Ꞇ', 'T'), 'Ⱦ', 'T'), 'Ʈ', 'T'), 'Ƭ', 'T'), 'Ŧ', 'T'), 'Ṯ', 'T'), 'Ṱ', 'T'), 'Ţ', 'T'), 'Ț', 'T'), 'Ṭ', 'T'), 'Ť', 'T'), 'Ṫ', 'T'), 'T', 'T'), 'Ⓣ', 'T'), 'T', 'T'), 'Ꞅ', 'S'), 'Ꞩ', 'S'), 'Ȿ', 'S'), 'Ş', 'S'), 'Ș', 'S'), 'Ṩ', 'S'), 'Ṣ', 'S'), 'Ṧ', 'S'), 'Š', 'S'), 'Ṡ', 'S'), 'Ŝ', 'S'), 'Ṥ', 'S'), 'Ś', 'S'), 'ẞ', 'S'), 'S', 'S'), 'Ⓢ', 'S'), 'S', 'S'), 'Ꞃ', 'R'), 'Ꞧ', 'R'), 'Ꝛ', 'R'), 'Ɽ', 'R'), 'Ɍ', 'R'), 'Ṟ', 'R'), 'Ŗ', 'R'), 'Ṝ', 'R'), 'Ṛ', 'R'), 'Ȓ', 'R'), 'Ȑ', 'R'), 'Ř', 'R'), 'Ṙ', 'R'), 'Ŕ', 'R'), 'R', 'R'), 'Ⓡ', 'R'), 'R', 'R'), 'Ɋ', 'Q'), 'Ꝙ', 'Q'), 'Ꝗ', 'Q'), 'Q', 'Q'), 'Ⓠ', 'Q'), 'Q', 'Q'), 'Ꝕ', 'P'), 'Ꝓ', 'P'), 'Ꝑ', 'P'), 'Ᵽ', 'P'), 'Ƥ', 'P'), 'Ṗ', 'P'), 'Ṕ', 'P'), 'P', 'P'), 'Ⓟ', 'P'), 'P', 'P'), 'Ꝍ', 'O'), 'Ꝋ', 'O'), 'Ɵ', 'O'), 'Ɔ', 'O'), 'Ǿ', 'O'), 'Ø', 'O'), 'Ǭ', 'O'), 'Ǫ', 'O'), 'Ộ', 'O'), 'Ọ', 'O'), 'Ợ', 'O'), 'Ở', 'O'), 'Ỡ', 'O'), 'Ớ', 'O'), 'Ờ', 'O'), 'Ơ', 'O'), 'Ȏ', 'O'), 'Ȍ', 'O'), 'Ǒ', 'O'), 'Ő', 'O'), 'Ỏ', 'O'), 'Ȫ', 'O'), 'Ö', 'O'), 'Ȱ', 'O'), 'Ȯ', 'O'), 'Ŏ', 'O'), 'Ṓ', 'O'), 'Ṑ', 'O'), 'Ō', 'O'), 'Ṏ', 'O'), 'Ȭ', 'O'), 'Ṍ', 'O'), 'Õ', 'O'), 'Ổ', 'O'), 'Ỗ', 'O'), 'Ố', 'O'), 'Ồ', 'O'), 'Ô', 'O'), 'Ó', 'O'), 'Ò', 'O'), 'O', 'O'), 'Ⓞ', 'O'), 'O', 'O'), 'Ꞥ', 'N'), 'Ꞑ', 'N'), 'Ɲ', 'N'), 'Ƞ', 'N'), 'Ṉ', 'N'), 'Ṋ', 'N'), 'Ņ', 'N'), 'Ṇ', 'N'), 'Ň', 'N'), 'Ṅ', 'N'), 'Ñ', 'N'), 'Ń', 'N'), 'Ǹ', 'N'), 'N', 'N'), 'Ⓝ', 'N'), 'N', 'N'), 'Ɯ', 'M'), 'Ɱ', 'M'), 'Ṃ', 'M'), 'Ṁ', 'M'), 'Ḿ', 'M'), 'M', 'M'), 'Ⓜ', 'M'), 'M', 'M'), 'Ꞁ', 'L'), 'Ꝇ', 'L'), 'Ꝉ', 'L'), 'Ⱡ', 'L'), 'Ɫ', 'L'), 'Ƚ', 'L'), 'Ł', 'L'), 'Ḻ', 'L'), 'Ḽ', 'L'), 'Ļ', 'L'), 'Ḹ', 'L'), 'Ḷ', 'L'), 'Ľ', 'L'), 'Ĺ', 'L'), 'Ŀ', 'L'), 'L', 'L'), 'Ⓛ', 'L'), 'L', 'L'), 'Ꞣ', 'K'), 'Ꝅ', 'K'), 'Ꝃ', 'K'), 'Ꝁ', 'K'), 'Ⱪ', 'K'), 'Ƙ', 'K'), 'Ḵ', 'K'), 'Ķ', 
'K'), 'Ḳ', 'K'), 'Ǩ', 'K'), 'Ḱ', 'K'), 'K', 'K'), 'Ⓚ', 'K'), 'K', 'K'), 'Ɉ', 'J'), 'Ĵ', 'J'), 'J', 'J'), 'Ⓙ', 'J'), 'J', 'J'), 'Ɨ', 'I'), 'Ḭ', 'I'), 'Į', 'I'), 'Ị', 'I'), 'Ȋ', 'I'), 'Ȉ', 'I'), 'Ǐ', 'I'), 'Ỉ', 'I'), 'Ḯ', 'I'), 'Ï', 'I'), 'İ', 'I'), 'Ĭ', 'I'), 'Ī', 'I'), 'Ĩ', 'I'), 'Î', 'I'), 'Í', 'I'), 'Ì', 'I'), 'I', 'I'), 'Ⓘ', 'I'), 'I', 'I'), 'Ɥ', 'H'), 'Ⱶ', 'H'), 'Ⱨ', 'H'), 'Ħ', 'H'), 'Ḫ', 'H'), 'Ḩ', 'H'), 'Ḥ', 'H'), 'Ȟ', 'H'), 'Ḧ', 'H'), 'Ḣ', 'H'), 'Ĥ', 'H'), 'H', 'H'), 'Ⓗ', 'H'), 'H', 'H'), 'Ꝿ', 'G'), 'Ᵹ', 'G'), 'Ꞡ', 'G'), 'Ɠ', 'G'), 'Ǥ', 'G'), 'Ģ', 'G'), 'Ǧ', 'G'), 'Ġ', 'G'), 'Ğ', 'G'), 'Ḡ', 'G'), 'Ĝ', 'G'), 'Ǵ', 'G'), 'G', 'G'), 'Ⓖ', 'G'), 'G', 'G'), 'Ꝼ', 'F'), 'Ƒ', 'F'), 'Ḟ', 'F'), 'F', 'F'), 'Ⓕ', 'F'), 'F', 'F'), 'Ǝ', 'E'), 'Ɛ', 'E'), 'Ḛ', 'E'), 'Ḙ', 'E'), 'Ę', 'E'), 'Ḝ', 'E'), 'Ȩ', 'E'), 'Ệ', 'E'), 'Ẹ', 'E'), 'Ȇ', 'E'), 'Ȅ', 'E'), 'Ě', 'E'), 'Ẻ', 'E'), 'Ë', 'E'), 'Ė', 'E'), 'Ĕ', 'E'), 'Ḗ', 'E'), 'Ḕ', 'E'), 'Ē', 'E'), 'Ẽ', 'E'), 'Ể', 'E'), 'Ễ', 'E'), 'Ế', 'E'), 'Ề', 'E'), 'Ê', 'E'), 'É', 'E'), 'È', 'E'), 'E', 'E'), 'Ⓔ', 'E'), 'E', 'E'), 'Ð', 'D'), 'Ꝺ', 'D'), 'Ɖ', 'D'), 'Ɗ', 'D'), 'Ƌ', 'D'), 'Đ', 'D'), 'Ḏ', 'D'), 'Ḓ', 'D'), 'Ḑ', 'D'), 'Ḍ', 'D'), 'Ď', 'D'), 'Ḋ', 'D'), 'D', 'D'), 'Ⓓ', 'D'), 'D', 'D'), 'Ꜿ', 'C'), 'Ȼ', 'C'), 'Ƈ', 'C'), 'Ḉ', 'C'), 'Ç', 'C'), 'Č', 'C'), 'Ċ', 'C'), 'Ĉ', 'C'), 'Ć', 'C'), 'C', 'C'), 'Ⓒ', 'C'), 'C', 'C'), 'Ɓ', 'B'), 'Ƃ', 'B'), 'Ƀ', 'B'), 'Ḇ', 'B'), 'Ḅ', 'B'), 'Ḃ', 'B'), 'B', 'B'), 'Ⓑ', 'B'), 'B', 'B'), 'Ɐ', 'A'), 'Ⱥ', 'A'), 'Ą', 'A'), 'Ḁ', 'A'), 'Ặ', 'A'), 'Ậ', 'A'), 'Ạ', 'A'), 'Ȃ', 'A'), 'Ȁ', 'A'), 'Ǎ', 'A'), 'Ǻ', 'A'), 'Å', 'A'), 'Ả', 'A'), 'Ǟ', 'A'), 'Ä', 'A'), 'Ǡ', 'A'), 'Ȧ', 'A'), 'Ẳ', 'A'), 'Ẵ', 'A'), 'Ắ', 'A'), 'Ằ', 'A'), 'Ă', 'A'), 'Ā', 'A'), 'Ã', 'A'), 'Ẩ', 'A'), 'Ẫ', 'A'), 'Ấ', 'A'), 'Ầ', 'A'), 'Â', 'A'), 'Á', 'A'), 'À', 'A'), 'A', 'A'), 'Ⓐ', 'A'), 'A', 'A')

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

https://stackoverflow.com/questions/990904/remove-accents-diacritics-in-a-string-in-javascript

        {'base':'A', 'letters':'\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F'},
        {'base':'AA','letters':'\uA732'},
        {'base':'AE','letters':'\u00C6\u01FC\u01E2'},
        {'base':'AO','letters':'\uA734'},
        {'base':'AU','letters':'\uA736'},
        {'base':'AV','letters':'\uA738\uA73A'},
        {'base':'AY','letters':'\uA73C'},
        {'base':'B', 'letters':'\u0042\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0182\u0181'},
        {'base':'C', 'letters':'\u0043\u24B8\uFF23\u0106\u0108\u010A\u010C\u00C7\u1E08\u0187\u023B\uA73E'},
        {'base':'D', 'letters':'\u0044\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018B\u018A\u0189\uA779\u00D0'},
        {'base':'DZ','letters':'\u01F1\u01C4'},
        {'base':'Dz','letters':'\u01F2\u01C5'},
        {'base':'E', 'letters':'\u0045\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E'},
        {'base':'F', 'letters':'\u0046\u24BB\uFF26\u1E1E\u0191\uA77B'},
        {'base':'G', 'letters':'\u0047\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E'},
        {'base':'H', 'letters':'\u0048\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D'},
        {'base':'I', 'letters':'\u0049\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197'},
        {'base':'J', 'letters':'\u004A\u24BF\uFF2A\u0134\u0248'},
        {'base':'K', 'letters':'\u004B\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2'},
        {'base':'L', 'letters':'\u004C\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780'},
        {'base':'LJ','letters':'\u01C7'},
        {'base':'Lj','letters':'\u01C8'},
        {'base':'M', 'letters':'\u004D\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C'},
        {'base':'N', 'letters':'\u004E\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u0220\u019D\uA790\uA7A4'},
        {'base':'NJ','letters':'\u01CA'},
        {'base':'Nj','letters':'\u01CB'},
        {'base':'O', 'letters':'\u004F\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C'},
        {'base':'OI','letters':'\u01A2'},
        {'base':'OO','letters':'\uA74E'},
        {'base':'OU','letters':'\u0222'},
        {'base':'OE','letters':'\u008C\u0152'},
        {'base':'oe','letters':'\u009C\u0153'},
        {'base':'P', 'letters':'\u0050\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754'},
        {'base':'Q', 'letters':'\u0051\u24C6\uFF31\uA756\uA758\u024A'},
        {'base':'R', 'letters':'\u0052\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782'},
        {'base':'S', 'letters':'\u0053\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784'},
        {'base':'T', 'letters':'\u0054\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786'},
        {'base':'TZ','letters':'\uA728'},
        {'base':'U', 'letters':'\u0055\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244'},
        {'base':'V', 'letters':'\u0056\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245'},
        {'base':'VY','letters':'\uA760'},
        {'base':'W', 'letters':'\u0057\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72'},
        {'base':'X', 'letters':'\u0058\u24CD\uFF38\u1E8A\u1E8C'},
        {'base':'Y', 'letters':'\u0059\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE'},
        {'base':'Z', 'letters':'\u005A\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762'},
        {'base':'a', 'letters':'\u0061\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250'},
        {'base':'aa','letters':'\uA733'},
        {'base':'ae','letters':'\u00E6\u01FD\u01E3'},
        {'base':'ao','letters':'\uA735'},
        {'base':'au','letters':'\uA737'},
        {'base':'av','letters':'\uA739\uA73B'},
        {'base':'ay','letters':'\uA73D'},
        {'base':'b', 'letters':'\u0062\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253'},
        {'base':'c', 'letters':'\u0063\u24D2\uFF43\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184'},
        {'base':'d', 'letters':'\u0064\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\uA77A'},
        {'base':'dz','letters':'\u01F3\u01C6'},
        {'base':'e', 'letters':'\u0065\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u025B\u01DD'},
        {'base':'f', 'letters':'\u0066\u24D5\uFF46\u1E1F\u0192\uA77C'},
        {'base':'g', 'letters':'\u0067\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\u1D79\uA77F'},
        {'base':'h', 'letters':'\u0068\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265'},
        {'base':'hv','letters':'\u0195'},
        {'base':'i', 'letters':'\u0069\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131'},
        {'base':'j', 'letters':'\u006A\u24D9\uFF4A\u0135\u01F0\u0249'},
        {'base':'k', 'letters':'\u006B\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3'},
        {'base':'l', 'letters':'\u006C\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747'},
        {'base':'lj','letters':'\u01C9'},
        {'base':'m', 'letters':'\u006D\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F'},
        {'base':'n', 'letters':'\u006E\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5'},
        {'base':'nj','letters':'\u01CC'},
        {'base':'o', 'letters':'\u006F\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\u0254\uA74B\uA74D\u0275'},
        {'base':'oi','letters':'\u01A3'},
        {'base':'ou','letters':'\u0223'},
        {'base':'oo','letters':'\uA74F'},
        {'base':'p','letters':'\u0070\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755'},
        {'base':'q','letters':'\u0071\u24E0\uFF51\u024B\uA757\uA759'},
        {'base':'r','letters':'\u0072\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783'},
        {'base':'s','letters':'\u0073\u24E2\uFF53\u00DF\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B'},
        {'base':'t','letters':'\u0074\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787'},
        {'base':'tz','letters':'\uA729'},
        {'base':'u','letters': '\u0075\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289'},
        {'base':'v','letters':'\u0076\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C'},
        {'base':'vy','letters':'\uA761'},
        {'base':'w','letters':'\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73'},
        {'base':'x','letters':'\u0078\u24E7\uFF58\u1E8B\u1E8D'},
        {'base':'y','letters':'\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF'},
        {'base':'z','letters':'\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763'}
\s+{'base':'([^']+)',\s*'letters':\s*'([^']+?)'}.*
  c("$1", "$2") + \
  c("A", "\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F") + \
  c("AA", "\uA732") + \
  c("AE", "\u00C6\u01FC\u01E2") + \
  c("AO", "\uA734") + \
  c("AU", "\uA736") + \
  c("AV", "\uA738\uA73A") + \
  c("AY", "\uA73C") + \
  c("B", "\u0042\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0182\u0181") + \
  c("C", "\u0043\u24B8\uFF23\u0106\u0108\u010A\u010C\u00C7\u1E08\u0187\u023B\uA73E") + \
  c("D", "\u0044\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018B\u018A\u0189\uA779\u00D0") + \
  c("DZ", "\u01F1\u01C4") + \
  c("Dz", "\u01F2\u01C5") + \
  c("E", "\u0045\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E") + \
  c("F", "\u0046\u24BB\uFF26\u1E1E\u0191\uA77B") + \
  c("G", "\u0047\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E") + \
  c("H", "\u0048\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D") + \
  c("I", "\u0049\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197") + \
  c("J", "\u004A\u24BF\uFF2A\u0134\u0248") + \
  c("K", "\u004B\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2") + \
  c("L", "\u004C\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780") + \
  c("LJ", "\u01C7") + \
  c("Lj", "\u01C8") + \
  c("M", "\u004D\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C") + \
  c("N", "\u004E\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u0220\u019D\uA790\uA7A4") + \
  c("NJ", "\u01CA") + \
  c("Nj", "\u01CB") + \
  c("O", "\u004F\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C") + \
  c("OI", "\u01A2") + \
  c("OO", "\uA74E") + \
  c("OU", "\u0222") + \
  c("OE", "\u008C\u0152") + \
  c("oe", "\u009C\u0153") + \
  c("P", "\u0050\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754") + \
  c("Q", "\u0051\u24C6\uFF31\uA756\uA758\u024A") + \
  c("R", "\u0052\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782") + \
  c("S", "\u0053\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784") + \
  c("T", "\u0054\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786") + \
  c("TZ", "\uA728") + \
  c("U", "\u0055\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244") + \
  c("V", "\u0056\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245") + \
  c("VY", "\uA760") + \
  c("W", "\u0057\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72") + \
  c("X", "\u0058\u24CD\uFF38\u1E8A\u1E8C") + \
  c("Y", "\u0059\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE") + \
  c("Z", "\u005A\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762") + \
  c("a", "\u0061\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250") + \
  c("aa", "\uA733") + \
  c("ae", "\u00E6\u01FD\u01E3") + \
  c("ao", "\uA735") + \
  c("au", "\uA737") + \
  c("av", "\uA739\uA73B") + \
  c("ay", "\uA73D") + \
  c("b", "\u0062\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253") + \
  c("c", "\u0063\u24D2\uFF43\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184") + \
  c("d", "\u0064\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\uA77A") + \
  c("dz", "\u01F3\u01C6") + \
  c("e", "\u0065\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u025B\u01DD") + \
  c("f", "\u0066\u24D5\uFF46\u1E1F\u0192\uA77C") + \
  c("g", "\u0067\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\u1D79\uA77F") + \
  c("h", "\u0068\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265") + \
  c("hv", "\u0195") + \
  c("i", "\u0069\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131") + \
  c("j", "\u006A\u24D9\uFF4A\u0135\u01F0\u0249") + \
  c("k", "\u006B\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3") + \
  c("l", "\u006C\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747") + \
  c("lj", "\u01C9") + \
  c("m", "\u006D\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F") + \
  c("n", "\u006E\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5") + \
  c("nj", "\u01CC") + \
  c("o", "\u006F\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\u0254\uA74B\uA74D\u0275") + \
  c("oi", "\u01A3") + \
  c("ou", "\u0223") + \
  c("oo", "\uA74F") + \
  c("p", "\u0070\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755") + \
  c("q", "\u0071\u24E0\uFF51\u024B\uA757\uA759") + \
  c("r", "\u0072\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783") + \
  c("s", "\u0073\u24E2\uFF53\u00DF\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B") + \
  c("t", "\u0074\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787") + \
  c("tz", "\uA729") + \
  c("u", "\u0075\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289") + \
  c("v", "\u0076\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C") + \
  c("vy", "\uA761") + \
  c("w", "\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73") + \
  c("x", "\u0078\u24E7\uFF58\u1E8B\u1E8D") + \
  c("y", "\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF") + \
  c("z", "\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763") + \

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

csvq -o SRD.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') AS agg_c3 FROM (SELECT LOWER(c2) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.dump.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

csvq -o SRD.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') AS agg_c3 FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.dump.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

動かない…

@rubyu
Copy link
Author

rubyu commented Jul 2, 2019

csvq -o SRD.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.dump.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

ダメ…

@rubyu
Copy link
Author

rubyu commented Jul 4, 2019

1	h1	d1
2	h1	d2
3	h2	d3
4	h3	d3

csvq -no-header "SELECT c2, LISTAGG(c3, '-') FROM test.tsv GROUP BY c2"
OK

csvq -no-header "SELECT lower_c2, LISTAGG(c3, '-') FROM (SELECT LOWER(c2) AS lower_c2, c3 FROM test.tsv) GROUP BY lower_c2"
OK

csvq -no-header "SELECT lower_c2, LISTAGG(c3, '-') FROM (SELECT LOWER(c2) AS lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM test.tsv)) GROUP BY lower_c2"
OK

あれ…?

@rubyu
Copy link
Author

rubyu commented Jul 4, 2019

grep -E "\tA" SRD.dump.tsv > SRD.A.tsv

csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

再現した

rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '==connected==') WITHIN GROUP (ORDER BY LEN(c3) ASC) FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '-') FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2, c3 ORDER BY lower_c2 ASC"

rm SRD.A.distinct.sort.agg.tsv; csvq -o SRD.A.distinct.sort.agg.tsv -f TSV -without-header -no-header "SELECT lower_c2 , LISTAGG(c3, '-') FROM (SELECT LOWER(TRIM(c2)) as lower_c2, c3 FROM (SELECT DISTINCT c2, c3 FROM SRD.A.tsv)) GROUP BY lower_c2 ORDER BY lower_c2 ASC"

OK!!!!
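GROUP BYの誤用だった… c3 まで GROUP BY に含めていたため (lower_c2, c3) の組ごとにグループが分かれてしまい、LISTAGG が見出し (lower_c2) 単位で c3 を連結できていなかった。GROUP BY lower_c2 のみにすると意図通りに集約される。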

@rubyu
Copy link
Author

rubyu commented Jul 4, 2019

dump dump1 get get1 get2
のような、末尾に数字がついてる系の見出しをうまく拾えない

@rubyu
Copy link
Author

rubyu commented Jul 4, 2019

1	h1	d1
2	h1	d2
3	h2	d3
4	h3	d3

csvq -no-header "
SELECT
c2,
SUBSTR(c2, 0,
CASE SUBSTR(c2, LEN(c2) - 1, 1)
WHEN '1' THEN LEN(c2) - 1
WHEN '2' THEN LEN(c2) - 1
WHEN '3' THEN LEN(c2) - 1
WHEN '4' THEN LEN(c2) - 1
WHEN '5' THEN LEN(c2) - 1
WHEN '6' THEN LEN(c2) - 1
WHEN '7' THEN LEN(c2) - 1
WHEN '8' THEN LEN(c2) - 1
WHEN '9' THEN LEN(c2) - 1
ELSE LEN(c2)
END) AS tail_trimmed
FROM test.tsv
GROUP BY c2"

+----+--------------+
| c2 | tail_trimmed |
+----+--------------+
| h1 | h |
| h2 | h |
| h3 | h |
+----+--------------+

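csvq のSUBSTRは0オリジン…? (SUBSTR(c2, LEN(c2) - 1, 1) が末尾の1文字に一致し、SUBSTR(c2, 0, n) が先頭から n 文字を返しているので、position は0オリジンと考えると上の結果と辻褄が合う)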

1	ha1	d1
2	haa1	d2
3	haaa2	d3
4	haaaa3	d3

csvq -no-header "
SELECT
c2,
SUBSTR(c2, 0,
CASE SUBSTR(c2, LEN(c2) - 1, 1)
WHEN '1' THEN LEN(c2) - 1
WHEN '2' THEN LEN(c2) - 1
WHEN '3' THEN LEN(c2) - 1
WHEN '4' THEN LEN(c2) - 1
WHEN '5' THEN LEN(c2) - 1
WHEN '6' THEN LEN(c2) - 1
WHEN '7' THEN LEN(c2) - 1
WHEN '8' THEN LEN(c2) - 1
WHEN '9' THEN LEN(c2) - 1
ELSE LEN(c2)
END) AS tail_trimmed
FROM test.tsv
GROUP BY c2"
+--------+--------------+
| c2 | tail_trimmed |
+--------+--------------+
| ha1 | ha |
| haa1 | haa |
| haaa2 | haaa |
| haaaa3 | haaaa |
+--------+--------------+

@rubyu
Copy link
Author

rubyu commented Jul 4, 2019

csvq "SELECT INSTR('foo@example.com', '@');"

+-------------------------------+
| INSTR('foo@example.com', '@') |
+-------------------------------+
|                             3 |
+-------------------------------+

@rubyu
Copy link
Author

rubyu commented Jul 5, 2019

BQでやる

Google Driveにgzを置いて、外部テーブルとしてBQで設定

INSERT INTO `test.srd`
SELECT * FROM `test.test`

@rubyu
Copy link
Author

rubyu commented Jul 5, 2019

インデックスが0x00で開始されているものだけをフィルタすると、0xFFFDが含まれているものをフィルタした場合よりも件数が増加した。これは想定に反している。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment