Last active: August 20, 2022 13:24
A personalized Japanese sentence database built from transcripts, searchable via tokenization and POS tagging.
import sqlite3
from pathlib import Path

import regex
import sudachipy

ROOT = Path(__file__).parent
sudachi = sudachipy.Dictionary(dict_type="full").create()


class DB:
    def __init__(self, db_path=ROOT / "idx.db") -> None:
        self.db = sqlite3.connect(db_path)
        self.db.row_factory = sqlite3.Row
        self.db.executescript(
            """
            CREATE TABLE IF NOT EXISTS "token" (
                id      INTEGER NOT NULL,
                dict    TEXT NOT NULL,
                pos     TEXT NOT NULL,  -- space-separated, with a leading and trailing space
                PRIMARY KEY (id)
            );

            CREATE TABLE IF NOT EXISTS "files" (
                id          INTEGER NOT NULL,
                filename    TEXT NOT NULL,
                modified    INT NOT NULL,
                PRIMARY KEY (id)
            );

            CREATE UNIQUE INDEX IF NOT EXISTS idx_files_filename ON "files"(filename);
            CREATE INDEX IF NOT EXISTS idx_files_modified ON "files"(modified);

            CREATE TABLE IF NOT EXISTS "lines" (
                id      INTEGER NOT NULL,
                "time"  TEXT NOT NULL,
                "text"  TEXT NOT NULL,
                PRIMARY KEY (id)
            );

            CREATE TABLE IF NOT EXISTS files_lines (
                files_id    INTEGER NOT NULL REFERENCES "files"(id) ON DELETE CASCADE,
                lines_id    INTEGER NOT NULL REFERENCES "lines"(id) ON DELETE CASCADE,
                PRIMARY KEY (files_id, lines_id)
            );

            CREATE TABLE IF NOT EXISTS lines_token (
                lines_id    INTEGER NOT NULL REFERENCES "lines"(id) ON DELETE CASCADE,
                token_id    INTEGER NOT NULL REFERENCES "token"(id) ON DELETE CASCADE,
                PRIMARY KEY (lines_id, token_id)
            );
            """
        )

        self._create()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.db.close()

    def _create(self):
        """Index every transcript under ../assets that is new or has been modified."""
        assets_folder = ROOT.parent / "assets"

        re_srt = regex.compile(r"^(\d{2,}:\d{2}:\d{2},\d{3}) -->")
        re_num = regex.compile(r"^\d+$")
        re_ass = regex.compile(r"^(\d+:\d{2}:\d{2}\.\d{2}) ([^ ]+) (.+)$")
        re_ja = regex.compile(r"[\p{Han}\p{Katakana}\p{Hiragana}]", regex.UNICODE)

        for f in assets_folder.glob("**/*.*"):
            filename = "/".join(f.relative_to(assets_folder).parts)
            modified = f.stat().st_mtime_ns

            # Skip files that have not changed since they were last indexed.
            for c in self.db.execute(
                'SELECT modified FROM "files" WHERE filename = ?',
                (filename,),
            ):
                if c[0] >= modified:
                    modified = 0

            if not modified:
                continue

            self.db.execute('DELETE FROM "files" WHERE filename = ?', (filename,))

            files_id = self.db.execute(
                'INSERT INTO "files" (filename, modified) VALUES (?, ?)',
                (filename, modified),
            ).lastrowid

            timestamp = ""
            for r in f.open("r", encoding="utf-8"):
                r = r.rstrip()
                if not r:
                    continue

                # SRT timestamp line ("00:00:01,000 --> ...")
                m = re_srt.match(r)
                if m:
                    timestamp = m.group(1).replace(",", ".")

                # SRT cue number: reset the timestamp
                if re_num.match(r):
                    timestamp = ""

                # Only index lines that contain Japanese characters
                if not re_ja.search(r):
                    continue

                # Timestamped transcript line ("h:mm:ss.cc <field> <rest>"):
                # keep the full match as the time and the second field as the text
                m = re_ass.match(r)
                if m:
                    timestamp = m.group(0)
                    r = m.group(2)

                if not timestamp:
                    continue

                if not r:
                    continue

                lines_id = self.db.execute(
                    'INSERT INTO "lines" ("time", "text") VALUES (?, ?)', (timestamp, r)
                ).lastrowid
                self.db.execute(
                    "INSERT INTO files_lines (files_id, lines_id) VALUES (?, ?)",
                    (files_id, lines_id),
                )

                # Tokens are inserted in order, so consecutive tokens of a line
                # get consecutive ids; search() relies on this.
                for t in sudachi.tokenize(r):
                    token_id = self.db.execute(
                        'INSERT INTO "token" (dict, pos) VALUES (?, ?)',
                        (
                            t.dictionary_form(),
                            " "
                            + " ".join(p for p in t.part_of_speech() if p != "*")
                            + " ",
                        ),
                    ).lastrowid
                    self.db.execute(
                        "INSERT INTO lines_token (lines_id, token_id) VALUES (?, ?)",
                        (lines_id, token_id),
                    )

        self.db.commit()

    def search(self, q: str):
        """Tokenize the query and return lines containing the same tokens consecutively.

        A "." in the query marks a gap that must be filled by a particle (助詞).
        """
        lines_id_map: dict[int, int] = {}
        not_lines: set[str] = set()
        token_no = 0

        for r in q.split("."):
            if token_no:
                # Bridge the "." gap with a particle token.
                next_map: dict[int, int] = {}

                for c in self.db.execute(
                    """
                    SELECT lines_id, token_id FROM lines_token
                    WHERE token_id IN (SELECT id FROM "token" WHERE pos LIKE '% 助詞 %')
                    {:s}
                    """.format(
                        f"AND lines_id NOT IN ({','.join(not_lines)})"
                        if len(not_lines)
                        else "",
                    )
                ):
                    if lines_id_map.get(c[0], -2) + 1 != c[1]:
                        not_lines.add(str(c[0]))
                        lines_id_map.pop(c[0], None)
                    else:
                        next_map[c[0]] = c[1]

                lines_id_map = next_map
                token_no += 1

            for t in sudachi.tokenize(r):
                next_map: dict[int, int] = {}

                for c in self.db.execute(
                    """
                    SELECT lines_id, token_id FROM lines_token
                    WHERE token_id IN (SELECT id FROM "token" WHERE dict = ? {:s})
                    {:s}
                    """.format(
                        "AND ("
                        + " OR ".join(
                            f"pos LIKE '% {p} %'"
                            for p in t.part_of_speech()
                            if p != "*"
                        )
                        + ")",
                        f"AND lines_id NOT IN ({','.join(not_lines)})"
                        if len(not_lines)
                        else "",
                    ),
                    (t.dictionary_form(),),
                ):
                    if token_no and lines_id_map.get(c[0], -2) + 1 != c[1]:
                        not_lines.add(str(c[0]))
                        lines_id_map.pop(c[0], None)
                    else:
                        next_map[c[0]] = c[1]

                lines_id_map = next_map
                token_no += 1

        if len(lines_id_map):
            return [
                dict(r)
                for r in self.db.execute(
                    f"""
                    SELECT "text", "time", filename
                    FROM "lines"
                    JOIN files_lines fl ON fl.lines_id = lines.id
                    JOIN "files" ON fl.files_id = files.id
                    WHERE lines_id IN ({','.join(str(k) for k in lines_id_map.keys())})
                    """
                )
            ]

        return []

    def like(self, q: str):
        """Plain substring search over the stored lines."""
        return [
            dict(r)
            for r in self.db.execute(
                f"""
                SELECT "text", "time", filename
                FROM "lines"
                JOIN files_lines fl ON fl.lines_id = lines.id
                JOIN "files" ON fl.files_id = files.id
                WHERE "text" LIKE '%'||?||'%'
                """,
                (q,),
            )
        ]
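A minimal usage sketch (not part of the gist). It assumes the script above is saved as jadb.py (a placeholder name) with an assets/ folder of transcript files in the directory above it; the example queries are also assumptions.

# Usage sketch -- "jadb" is a placeholder module name for the script above.
from jadb import DB

with DB() as db:
    # Token search: the query is tokenized with Sudachi; "." marks a gap
    # that should be filled by a particle (助詞) in the matched line.
    for row in db.search("食べ.いる"):
        print(row["filename"], row["time"], row["text"])

    # Plain substring search.
    for row in db.like("猫"):
        print(row["filename"], row["time"], row["text"])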
SudachiPy>=0.6.6
SudachiDict-Full>=20220729
regex>=2022.8.17
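These can be installed with pip (e.g. pip install -r requirements.txt, assuming the list above is saved as requirements.txt; the filename is an assumption). SudachiDict-Full supplies the "full" dictionary loaded by sudachipy.Dictionary(dict_type="full").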
GitHub Gist somehow doesn't allow naming the file list, and arbitrarily picks the gist's name from the alphabetically first filename?
Tested on Windows 11 with Python 3.10 (the pyenv-win build, which I think downloads from the Python Software Foundation).