Skip to content

Instantly share code, notes, and snippets.

@patarapolw
Last active August 20, 2022 13:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save patarapolw/9a0511427082b2fb89296d87b42467d9 to your computer and use it in GitHub Desktop.
Save patarapolw/9a0511427082b2fb89296d87b42467d9 to your computer and use it in GitHub Desktop.
Personalized Japanese sentence (from transcript) database, with searching; based on tokenization and POS tagging.
import sqlite3
from pathlib import Path
import regex
import sudachipy
# Directory containing this script; the index database is created alongside it.
ROOT = Path(__file__).parent
# Shared Sudachi tokenizer instance, built from the full dictionary
# (requires SudachiDict-Full, see requirements).
sudachi = sudachipy.Dictionary(dict_type="full").create()
class DB:
    """SQLite-backed index of Japanese transcript/subtitle lines.

    Lines are read from ``../assets`` (SRT cues and an ASS-like
    ``time speaker text`` layout), filtered to those containing Japanese
    characters, tokenized with Sudachi, and stored so that:

    - :meth:`search` matches a sequence of query tokens by dictionary form
      and part of speech, requiring adjacency between consecutive tokens;
    - :meth:`like` does a plain SQL-LIKE substring match on the raw text.
    """

    def __init__(self, db_path=ROOT / "idx.db") -> None:
        """Open (or create) the index at *db_path*, ensure the schema
        exists, and (re)index everything under ``../assets``."""
        self.db = sqlite3.connect(db_path)
        self.db.row_factory = sqlite3.Row
        # BUGFIX: sqlite3 disables foreign-key enforcement by default, which
        # made every ON DELETE CASCADE below a silent no-op — re-indexing a
        # modified file left stale files_lines rows behind, corrupting
        # search results. Must be set per-connection, before any DML.
        self.db.execute("PRAGMA foreign_keys = ON")
        self.db.executescript(
            """
            CREATE TABLE IF NOT EXISTS "token" (
                id      INTEGER NOT NULL,
                dict    TEXT NOT NULL,
                pos     TEXT NOT NULL, -- ' ' in back, front, and separated
                PRIMARY KEY (id)
            );
            CREATE TABLE IF NOT EXISTS "files" (
                id        INTEGER NOT NULL,
                filename  TEXT NOT NULL,
                modified  INT NOT NULL,
                PRIMARY KEY (id)
            );
            CREATE UNIQUE INDEX IF NOT EXISTS idx_files_filename ON "files"(filename);
            CREATE INDEX IF NOT EXISTS idx_files_modified ON "files"(modified);
            CREATE TABLE IF NOT EXISTS "lines" (
                id      INTEGER NOT NULL,
                "time"  TEXT NOT NULL,
                "text"  TEXT NOT NULL,
                PRIMARY KEY (id)
            );
            CREATE TABLE IF NOT EXISTS files_lines (
                files_id  INTEGER NOT NULL REFERENCES "files"(id) ON DELETE CASCADE,
                lines_id  INTEGER NOT NULL REFERENCES "lines"(id) ON DELETE CASCADE,
                PRIMARY KEY (files_id, lines_id)
            );
            CREATE TABLE IF NOT EXISTS lines_token (
                lines_id  INTEGER NOT NULL REFERENCES "lines"(id) ON DELETE CASCADE,
                token_id  INTEGER NOT NULL REFERENCES "token"(id) ON DELETE CASCADE,
                PRIMARY KEY (lines_id, token_id)
            );
            """
        )
        self._create()

    def __enter__(self):
        """Support ``with DB() as db:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        """Close the underlying connection on context exit."""
        self.db.close()

    def _create(self):
        """Scan ``../assets`` and (re)index any new or modified file.

        Files whose stored mtime is already current are skipped.  For each
        kept line, tokens are inserted one row per occurrence so that their
        ids are consecutive within the line — :meth:`search` relies on this
        to test token adjacency (``prev_id + 1 == next_id``).
        """
        assets_folder = ROOT.parent / "assets"
        re_srt = regex.compile(r"^(\d{2,}:\d{2}:\d{2},\d{3}) -->")
        re_num = regex.compile(r"^\d+$")
        re_ass = regex.compile(r"^(\d+:\d{2}:\d{2}\.\d{2}) ([^ ]+) (.+)$")
        re_ja = regex.compile(r"[\p{Han}\p{Katakana}\p{Hiragana}]", regex.UNICODE)
        for f in assets_folder.glob("**/*.*"):
            filename = "/".join(f.relative_to(assets_folder).parts)
            modified = f.stat().st_mtime_ns
            # modified == 0 doubles as a "skip this file" flag.
            for c in self.db.execute(
                'SELECT modified FROM "files" WHERE filename = ?',
                (filename,),
            ):
                if c[0] >= modified:
                    modified = 0
            if not modified:
                continue
            # Re-index: drop the old row. With foreign keys ON this cascades
            # into files_lines; orphaned "lines"/"token" rows remain but are
            # filtered out by the files_lines join at query output.
            self.db.execute('DELETE FROM "files" WHERE filename = ?', (filename,))
            files_id = self.db.execute(
                'INSERT INTO "files" (filename, modified) VALUES (?, ?)',
                (filename, modified),
            ).lastrowid
            timestamp = ""
            # BUGFIX: the file handle was previously iterated without ever
            # being closed; `with` guarantees release.
            with f.open("r", encoding="utf-8") as fp:
                for r in fp:
                    r = r.rstrip()
                    if not r:
                        continue
                    m = re_srt.match(r)
                    if m:
                        # SRT cue line: remember its start time (comma -> dot).
                        timestamp = m.group(1).replace(",", ".")
                    if re_num.match(r):
                        # SRT sequence number: reset the current cue.
                        timestamp = ""
                    if not re_ja.search(r):
                        continue
                    m = re_ass.match(r)
                    if m:
                        # NOTE(review): group(0) is the WHOLE matched line and
                        # group(2) is the no-space middle field, not group(1)/
                        # group(3) — looks deliberate for this data set, but
                        # worth confirming against a sample transcript.
                        timestamp = m.group(0)
                        r = m.group(2)
                    if not timestamp:
                        continue
                    if not r:
                        continue
                    lines_id = self.db.execute(
                        'INSERT INTO "lines" ("time", "text") VALUES (?, ?)',
                        (timestamp, r),
                    ).lastrowid
                    self.db.execute(
                        "INSERT INTO files_lines (files_id, lines_id) VALUES (?, ?)",
                        (files_id, lines_id),
                    )
                    for t in sudachi.tokenize(r):
                        token_id = self.db.execute(
                            'INSERT INTO "token" (dict, pos) VALUES (?, ?)',
                            (
                                t.dictionary_form(),
                                # POS fields are space-padded on both sides so
                                # LIKE '% x %' matches whole fields only.
                                " "
                                + " ".join(p for p in t.part_of_speech() if p != "*")
                                + " ",
                            ),
                        ).lastrowid
                        self.db.execute(
                            "INSERT INTO lines_token (lines_id, token_id) VALUES (?, ?)",
                            (lines_id, token_id),
                        )
        self.db.commit()

    def search(self, q: str):
        """Return lines matching the tokenized query *q*.

        *q* is tokenized with Sudachi; each query token must occur in the
        line with the same dictionary form and at least one matching POS
        field, and consecutive query tokens must be adjacent in the line
        (consecutive token ids, see :meth:`_create`).  A literal ``.`` in
        *q* matches any single particle (助詞) token.

        Returns a list of ``dict`` rows (text/time/filename), or ``[]``.
        """
        # lines_id -> id of the most recently matched token in that line.
        lines_id_map: dict[int, int] = {}
        # Lines already ruled out.  Stored as strings for SQL interpolation;
        # every value originates from an integer id, so this is not an
        # injection vector.
        not_lines: set[str] = set()
        token_no = 0
        for r in q.split("."):
            if token_no:
                # Between "."-separated segments, require one particle token
                # adjacent to the previous match.
                next_map: dict[int, int] = {}
                for c in self.db.execute(
                    """
                    SELECT lines_id, token_id FROM lines_token
                    WHERE token_id IN (SELECT id FROM "token" WHERE pos LIKE '% 助詞 %')
                    {:s}
                    """.format(
                        f"AND lines_id NOT IN ({','.join(not_lines)})"
                        if len(not_lines)
                        else "",
                    )
                ):
                    if lines_id_map.get(c[0], -2) + 1 != c[1]:
                        # Not adjacent to the previous match: rule the line out.
                        not_lines.add(str(c[0]))
                        lines_id_map.pop(c[0], None)
                    else:
                        next_map[c[0]] = c[1]
                lines_id_map = next_map
                token_no += 1
            for t in sudachi.tokenize(r):
                next_map: dict[int, int] = {}
                for c in self.db.execute(
                    """
                    SELECT lines_id, token_id FROM lines_token
                    WHERE token_id IN (SELECT id FROM "token" WHERE dict = ? {:s})
                    {:s}
                    """.format(
                        # POS fields come from Sudachi, not user input.
                        "AND ("
                        + " OR ".join(
                            f"pos LIKE '% {p} %'"
                            for p in t.part_of_speech()
                            if p != "*"
                        )
                        + ")",
                        f"AND lines_id NOT IN ({','.join(not_lines)})"
                        if len(not_lines)
                        else "",
                    ),
                    (t.dictionary_form(),),
                ):
                    if token_no and lines_id_map.get(c[0], -2) + 1 != c[1]:
                        not_lines.add(str(c[0]))
                        lines_id_map.pop(c[0], None)
                    else:
                        next_map[c[0]] = c[1]
                lines_id_map = next_map
                token_no += 1
        if len(lines_id_map):
            return [
                dict(r)
                for r in self.db.execute(
                    f"""
                    SELECT "text", "time", filename
                    FROM "lines"
                    JOIN files_lines fl ON fl.lines_id = lines.id
                    JOIN "files" ON fl.files_id = files.id
                    WHERE lines_id IN ({','.join(str(k) for k in lines_id_map.keys())})
                    """
                )
            ]
        return []

    def like(self, q: str):
        """Return lines whose text contains *q* as a substring.

        *q* is bound as a parameter; ``%`` wildcards are appended in SQL.
        """
        return [
            dict(r)
            for r in self.db.execute(
                f"""
                SELECT "text", "time", filename
                FROM "lines"
                JOIN files_lines fl ON fl.lines_id = lines.id
                JOIN "files" ON fl.files_id = files.id
                WHERE "text" LIKE '%'||?||'%'
                """,
                (q,),
            )
        ]
SudachiPy>=0.6.6
SudachiDict-Full>=20220729
regex>=2022.8.17
@patarapolw
Copy link
Author

GitHub Gist somehow doesn't allow choosing which file is listed first, and arbitrarily decides the Gist's name based on the alphabetical order of its filenames?

Tested on Windows 11 with Python 3.10 (pyenv-win version, which downloads from Python Foundation, I think).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment