Skip to content

Instantly share code, notes, and snippets.

@7shi
Last active May 24, 2020 22:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 7shi/8ce48804556420f18d0fa1fcead6fa56 to your computer and use it in GitHub Desktop.
Save 7shi/8ce48804556420f18d0fa1fcead6fa56 to your computer and use it in GitHub Desktop.
[py][sqlite3] Tatoeba
# CC0 http://creativecommons.org/publicdomain/zero/1.0/
usage = "[-n max sentences (default 10)] [-m max translations] languages..."
import tatoeba, sys
argi = 0
def getarg():
global argi
if argi + 1 >= len(sys.argv): return None
argi += 1
return sys.argv[argi]
def getargs():
return sys.argv[argi:]
n, m = 10, None
arg = getarg()
if arg == "-n":
n = int(getarg())
arg = getarg()
if arg == "-m":
m = int(getarg())
arg = getarg()
if not arg:
print("usage: %s %s" % (sys.argv[0], usage))
exit(1)
tatoeba.shows(*getargs(), num=n, max=m)
-- CC0 http://creativecommons.org/publicdomain/zero/1.0/
-- usage: sqlite3 tatoeba.db ".read mkdb.sql"
-- CSV files: https://tatoeba.org/jpn/downloads
.mode ascii
.separator "\t" "\n"
.print Importing \'sentences.csv\'...
CREATE TABLE sentences(id INTEGER, lang CHAR(8), text TEXT);
.import sentences.csv sentences
.print Creating \'langs\'...
CREATE TABLE langs(lid INTEGER PRIMARY KEY, name CHAR(8));
INSERT INTO langs(name)
SELECT DISTINCT lang FROM sentences ORDER BY lang;
CREATE INDEX langs_lang_idx ON langs(name);
.print Creating \'slang\'...
CREATE TABLE slang(id INTEGER PRIMARY KEY, lang INTEGER);
INSERT INTO slang
SELECT id, lid FROM sentences
INNER JOIN langs ON lang = name;
CREATE INDEX slang_lang_idx ON slang(lang);
.print Creating \'stext\'...
CREATE TABLE stext(id INTEGER PRIMARY KEY, text TEXT);
INSERT INTO stext SELECT id, text FROM sentences;
.print Dropping \'sentences\'...
DROP TABLE sentences;
.print Vacuuming...
VACUUM;
.print Importing \'links.csv\'...
CREATE TABLE links(id INTEGER, link INTEGER);
.import links.csv links
CREATE INDEX links_id_idx ON links(id);
.print Creating \'llang\'...
CREATE TABLE llang(id INTEGER, lang INTEGER);
INSERT INTO llang
SELECT * FROM slang
UNION
SELECT DISTINCT links.id AS id, lang FROM links
INNER JOIN slang ON link = slang.id;
CREATE INDEX llang_id_idx ON llang(id);
CREATE INDEX llang_lang_idx ON llang(lang);
# CC0 http://creativecommons.org/publicdomain/zero/1.0/
import functools, itertools, sqlite3, sys
conn = sqlite3.connect("tatoeba.db")
cur = conn.cursor()
lids = {}
for lid, name in cur.execute("SELECT * FROM langs"):
lids[name] = lid
langnames = [None] * (len(lids) + 1)
for name, lid in lids.items():
langnames[lid] = name
def convlangs(languages):
ret = [lid for l in languages if (lid := lids.get(l))]
return ret, len(ret)
def lang(n):
result = cur.execute("SELECT lang FROM slang WHERE id = ?", (n,)).fetchone()
return langnames[result[0]] if result else None
def text(n):
result = cur.execute("SELECT text FROM stext WHERE id = ?", (n,)).fetchone()
return result[0] if result else None
def get(n):
l = lang(n)
return {"lang": l, "text": text(n)} if l else None
def langs(n):
cur.execute("""
SELECT DISTINCT lang FROM links
INNER JOIN slang ON link = slang.id
WHERE links.id = ? ORDER BY lang
""", (n,))
return [langnames[n[0]] for n in cur]
def link(n, *languages):
ls = list(languages) if languages else langs(n)
ret = {}
for l in ls:
lid = lids.get(l)
if not lid:
ret[l] = []
continue
cur.execute("""
SELECT link FROM links
INNER JOIN slang ON link = slang.id
WHERE links.id = ? AND slang.lang = ?
""", (n, lid))
ret[l] = [n[0] for n in cur]
return ret
def translate(n, *languages):
ret = link(n, *languages)
l = lang(n)
if l in languages: ret[l] = [n] + ret[l]
for k in ret.keys():
ret[k] = [(n, text(n)) for n in ret[k]]
return ret
def show(trs, max=None):
if not trs: return
for k in trs.keys():
tr = trs[k]
for i in range(len(tr)):
if max and i >= max: break
print(k + ":", tr[i][1])
def haslink(n, *languages):
ls, lslen = convlangs(languages)
if lslen == 0 or lslen != len(languages): return False
result = cur.execute("""
SELECT COUNT(*) FROM (
SELECT DISTINCT lang FROM (SELECT link FROM links WHERE id = ?)
INNER JOIN slang ON link = slang.id AND lang IN (%s))
""" % ",".join("?" * lslen), [n] + ls).fetchone()
return result and result[0] == lslen
@functools.lru_cache
def countlink(lid):
result = cur.execute("""
SELECT COUNT(*) FROM llang WHERE lang = ?
""", (lid,)).fetchone()
return (lid, result[0] if result else 0)
def choose(*languages):
ls, lslen = convlangs(languages)
if lslen == 0 or lslen != len(languages): return []
if lslen == 1:
cur.execute("SELECT id FROM slang WHERE lang = ?", (ls[0],))
else:
ls = [t[0] for t in sorted(map(countlink, ls), key=lambda t: t[1])]
sql = "SELECT id1 as id FROM (SELECT id AS id1 FROM llang WHERE lang = ?)"
for i in range(1, len(ls)):
sql += "INNER JOIN llang l%d ON id1 = l%d.id AND l%d.lang = ?" % (i, i, i)
cur.execute(sql, ls)
return [n[0] for n in cur]
def shows(*ls, num=None, max=None):
ns = choose(*ls)
print("sentences:", len(ns))
if num and len(ns) > num: ns = ns[:num]
for n in ns:
if len(ls) > 1: print()
print("#%d %s: %s" % (n, lang(n), text(n)))
if len(ls) > 1: show(translate(n, *ls), max)
if __name__ == "__main__":
def t(s, output=False):
sys.stdout.write(s)
sys.stdout.flush()
if output: print(" ->")
result = eval(s)
if not output: print(" ->", result)
testlangs = ["eng", "jpn", "pms"]
for f in ["lang", "text", "get", "langs", "link"]:
for n in [1, 0]:
t('%s(%d)' % (f, n))
for l in testlangs:
t('link(1, "%s")' % l)
t('link(0, "eng")')
for n in [1, 0]:
t('translate(%d, "%s")' % (n, '", "'.join(testlangs)))
t('show(translate(1, "%s"), max=3)' % '", "'.join(testlangs), True)
for i in range(len(testlangs)):
for ls in itertools.combinations(testlangs, i + 1):
t('haslink(1, "%s")' % '", "'.join(ls))
t('haslink(0, "eng")')
t('haslink(0, "eng", "jpn")')
for i in range(len(testlangs)):
for ls in itertools.combinations(testlangs, i + 1):
t('len(choose("%s"))' % '", "'.join(ls))
t('choose("%s")' % '", "'.join(testlangs))
t('shows("%s", num=2, max=3)' % '", "'.join(testlangs), True)
@7shi
Copy link
Author

7shi commented May 24, 2020

詳細は以下の記事を参照してください。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment