Last active
May 24, 2020 22:11
-
-
Save 7shi/8ce48804556420f18d0fa1fcead6fa56 to your computer and use it in GitHub Desktop.
[py][sqlite3] Tatoeba
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CC0 http://creativecommons.org/publicdomain/zero/1.0/ | |
usage = "[-n max sentences (default 10)] [-m max translations] languages..." | |
import tatoeba, sys | |
argi = 0 | |
def getarg(): | |
global argi | |
if argi + 1 >= len(sys.argv): return None | |
argi += 1 | |
return sys.argv[argi] | |
def getargs(): | |
return sys.argv[argi:] | |
n, m = 10, None | |
arg = getarg() | |
if arg == "-n": | |
n = int(getarg()) | |
arg = getarg() | |
if arg == "-m": | |
m = int(getarg()) | |
arg = getarg() | |
if not arg: | |
print("usage: %s %s" % (sys.argv[0], usage)) | |
exit(1) | |
tatoeba.shows(*getargs(), num=n, max=m) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- CC0 http://creativecommons.org/publicdomain/zero/1.0/ | |
-- usage: sqlite3 tatoeba.db ".read mkdb.sql" | |
-- CSV files: https://tatoeba.org/jpn/downloads | |
.mode ascii | |
.separator "\t" "\n" | |
.print Importing \'sentences.csv\'... | |
CREATE TABLE sentences(id INTEGER, lang CHAR(8), text TEXT); | |
.import sentences.csv sentences | |
.print Creating \'langs\'... | |
CREATE TABLE langs(lid INTEGER PRIMARY KEY, name CHAR(8)); | |
INSERT INTO langs(name) | |
SELECT DISTINCT lang FROM sentences ORDER BY lang; | |
CREATE INDEX langs_lang_idx ON langs(name); | |
.print Creating \'slang\'... | |
CREATE TABLE slang(id INTEGER PRIMARY KEY, lang INTEGER); | |
INSERT INTO slang | |
SELECT id, lid FROM sentences | |
INNER JOIN langs ON lang = name; | |
CREATE INDEX slang_lang_idx ON slang(lang); | |
.print Creating \'stext\'... | |
CREATE TABLE stext(id INTEGER PRIMARY KEY, text TEXT); | |
INSERT INTO stext SELECT id, text FROM sentences; | |
.print Dropping \'sentences\'... | |
DROP TABLE sentences; | |
.print Vacuuming... | |
VACUUM; | |
.print Importing \'links.csv\'... | |
CREATE TABLE links(id INTEGER, link INTEGER); | |
.import links.csv links | |
CREATE INDEX links_id_idx ON links(id); | |
.print Creating \'llang\'... | |
CREATE TABLE llang(id INTEGER, lang INTEGER); | |
INSERT INTO llang | |
SELECT * FROM slang | |
UNION | |
SELECT DISTINCT links.id AS id, lang FROM links | |
INNER JOIN slang ON link = slang.id; | |
CREATE INDEX llang_id_idx ON llang(id); | |
CREATE INDEX llang_lang_idx ON llang(lang); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CC0 http://creativecommons.org/publicdomain/zero/1.0/ | |
import functools, itertools, sqlite3, sys | |
conn = sqlite3.connect("tatoeba.db") | |
cur = conn.cursor() | |
lids = {} | |
for lid, name in cur.execute("SELECT * FROM langs"): | |
lids[name] = lid | |
langnames = [None] * (len(lids) + 1) | |
for name, lid in lids.items(): | |
langnames[lid] = name | |
def convlangs(languages): | |
ret = [lid for l in languages if (lid := lids.get(l))] | |
return ret, len(ret) | |
def lang(n): | |
result = cur.execute("SELECT lang FROM slang WHERE id = ?", (n,)).fetchone() | |
return langnames[result[0]] if result else None | |
def text(n): | |
result = cur.execute("SELECT text FROM stext WHERE id = ?", (n,)).fetchone() | |
return result[0] if result else None | |
def get(n): | |
l = lang(n) | |
return {"lang": l, "text": text(n)} if l else None | |
def langs(n): | |
cur.execute(""" | |
SELECT DISTINCT lang FROM links | |
INNER JOIN slang ON link = slang.id | |
WHERE links.id = ? ORDER BY lang | |
""", (n,)) | |
return [langnames[n[0]] for n in cur] | |
def link(n, *languages): | |
ls = list(languages) if languages else langs(n) | |
ret = {} | |
for l in ls: | |
lid = lids.get(l) | |
if not lid: | |
ret[l] = [] | |
continue | |
cur.execute(""" | |
SELECT link FROM links | |
INNER JOIN slang ON link = slang.id | |
WHERE links.id = ? AND slang.lang = ? | |
""", (n, lid)) | |
ret[l] = [n[0] for n in cur] | |
return ret | |
def translate(n, *languages): | |
ret = link(n, *languages) | |
l = lang(n) | |
if l in languages: ret[l] = [n] + ret[l] | |
for k in ret.keys(): | |
ret[k] = [(n, text(n)) for n in ret[k]] | |
return ret | |
def show(trs, max=None): | |
if not trs: return | |
for k in trs.keys(): | |
tr = trs[k] | |
for i in range(len(tr)): | |
if max and i >= max: break | |
print(k + ":", tr[i][1]) | |
def haslink(n, *languages): | |
ls, lslen = convlangs(languages) | |
if lslen == 0 or lslen != len(languages): return False | |
result = cur.execute(""" | |
SELECT COUNT(*) FROM ( | |
SELECT DISTINCT lang FROM (SELECT link FROM links WHERE id = ?) | |
INNER JOIN slang ON link = slang.id AND lang IN (%s)) | |
""" % ",".join("?" * lslen), [n] + ls).fetchone() | |
return result and result[0] == lslen | |
@functools.lru_cache | |
def countlink(lid): | |
result = cur.execute(""" | |
SELECT COUNT(*) FROM llang WHERE lang = ? | |
""", (lid,)).fetchone() | |
return (lid, result[0] if result else 0) | |
def choose(*languages): | |
ls, lslen = convlangs(languages) | |
if lslen == 0 or lslen != len(languages): return [] | |
if lslen == 1: | |
cur.execute("SELECT id FROM slang WHERE lang = ?", (ls[0],)) | |
else: | |
ls = [t[0] for t in sorted(map(countlink, ls), key=lambda t: t[1])] | |
sql = "SELECT id1 as id FROM (SELECT id AS id1 FROM llang WHERE lang = ?)" | |
for i in range(1, len(ls)): | |
sql += "INNER JOIN llang l%d ON id1 = l%d.id AND l%d.lang = ?" % (i, i, i) | |
cur.execute(sql, ls) | |
return [n[0] for n in cur] | |
def shows(*ls, num=None, max=None): | |
ns = choose(*ls) | |
print("sentences:", len(ns)) | |
if num and len(ns) > num: ns = ns[:num] | |
for n in ns: | |
if len(ls) > 1: print() | |
print("#%d %s: %s" % (n, lang(n), text(n))) | |
if len(ls) > 1: show(translate(n, *ls), max) | |
if __name__ == "__main__": | |
def t(s, output=False): | |
sys.stdout.write(s) | |
sys.stdout.flush() | |
if output: print(" ->") | |
result = eval(s) | |
if not output: print(" ->", result) | |
testlangs = ["eng", "jpn", "pms"] | |
for f in ["lang", "text", "get", "langs", "link"]: | |
for n in [1, 0]: | |
t('%s(%d)' % (f, n)) | |
for l in testlangs: | |
t('link(1, "%s")' % l) | |
t('link(0, "eng")') | |
for n in [1, 0]: | |
t('translate(%d, "%s")' % (n, '", "'.join(testlangs))) | |
t('show(translate(1, "%s"), max=3)' % '", "'.join(testlangs), True) | |
for i in range(len(testlangs)): | |
for ls in itertools.combinations(testlangs, i + 1): | |
t('haslink(1, "%s")' % '", "'.join(ls)) | |
t('haslink(0, "eng")') | |
t('haslink(0, "eng", "jpn")') | |
for i in range(len(testlangs)): | |
for ls in itertools.combinations(testlangs, i + 1): | |
t('len(choose("%s"))' % '", "'.join(ls)) | |
t('choose("%s")' % '", "'.join(testlangs)) | |
t('shows("%s", num=2, max=3)' % '", "'.join(testlangs), True) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
詳細は以下の記事を参照してください。