Last active: August 20, 2022 13:24
A personalized Japanese sentence database built from transcripts, searchable via tokenization and POS tagging.
import sqlite3
from pathlib import Path

import regex
import sudachipy

ROOT = Path(__file__).parent
sudachi = sudachipy.Dictionary(dict_type="full").create()


class DB:
    def __init__(self, db_path=ROOT / "idx.db") -> None:
        self.db = sqlite3.connect(db_path)
        self.db.row_factory = sqlite3.Row
        self.db.executescript(
            """
            CREATE TABLE IF NOT EXISTS "token" (
                id      INTEGER NOT NULL,
                dict    TEXT NOT NULL,
                pos     TEXT NOT NULL,  -- space-separated, with a leading and trailing space
                PRIMARY KEY (id)
            );

            CREATE TABLE IF NOT EXISTS "files" (
                id          INTEGER NOT NULL,
                filename    TEXT NOT NULL,
                modified    INT NOT NULL,
                PRIMARY KEY (id)
            );

            CREATE UNIQUE INDEX IF NOT EXISTS idx_files_filename ON "files"(filename);
            CREATE INDEX IF NOT EXISTS idx_files_modified ON "files"(modified);

            CREATE TABLE IF NOT EXISTS "lines" (
                id      INTEGER NOT NULL,
                "time"  TEXT NOT NULL,
                "text"  TEXT NOT NULL,
                PRIMARY KEY (id)
            );

            CREATE TABLE IF NOT EXISTS files_lines (
                files_id    INTEGER NOT NULL REFERENCES "files"(id) ON DELETE CASCADE,
                lines_id    INTEGER NOT NULL REFERENCES "lines"(id) ON DELETE CASCADE,
                PRIMARY KEY (files_id, lines_id)
            );

            CREATE TABLE IF NOT EXISTS lines_token (
                lines_id    INTEGER NOT NULL REFERENCES "lines"(id) ON DELETE CASCADE,
                token_id    INTEGER NOT NULL REFERENCES "token"(id) ON DELETE CASCADE,
                PRIMARY KEY (lines_id, token_id)
            );
            """
        )

        self._create()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.db.close()

    def _create(self):
        """Index every transcript under ../assets that is new or has been modified."""
        assets_folder = ROOT.parent / "assets"

        re_srt = regex.compile(r"^(\d{2,}:\d{2}:\d{2},\d{3}) -->")
        re_num = regex.compile(r"^\d+$")
        re_ass = regex.compile(r"^(\d+:\d{2}:\d{2}\.\d{2}) ([^ ]+) (.+)$")
        re_ja = regex.compile(r"[\p{Han}\p{Katakana}\p{Hiragana}]", regex.UNICODE)

        for f in assets_folder.glob("**/*.*"):
            filename = "/".join(f.relative_to(assets_folder).parts)
            modified = f.stat().st_mtime_ns

            # Skip files that have not changed since they were last indexed.
            for c in self.db.execute(
                'SELECT modified FROM "files" WHERE filename = ?',
                (filename,),
            ):
                if c[0] >= modified:
                    modified = 0

            if not modified:
                continue

            self.db.execute('DELETE FROM "files" WHERE filename = ?', (filename,))

            files_id = self.db.execute(
                'INSERT INTO "files" (filename, modified) VALUES (?, ?)',
                (filename, modified),
            ).lastrowid

            timestamp = ""
            for r in f.open("r", encoding="utf-8"):
                r = r.rstrip()
                if not r:
                    continue

                # SRT timestamp line ("00:00:01,000 --> ...")
                m = re_srt.match(r)
                if m:
                    timestamp = m.group(1).replace(",", ".")

                # SRT cue number: reset the timestamp
                if re_num.match(r):
                    timestamp = ""

                # Only index lines that contain Japanese characters
                if not re_ja.search(r):
                    continue

                # Timestamped transcript line ("h:mm:ss.cc <field> <rest>"):
                # keep the full match as the time and the second field as the text
                m = re_ass.match(r)
                if m:
                    timestamp = m.group(0)
                    r = m.group(2)

                if not timestamp:
                    continue

                if not r:
                    continue

                lines_id = self.db.execute(
                    'INSERT INTO "lines" ("time", "text") VALUES (?, ?)', (timestamp, r)
                ).lastrowid
                self.db.execute(
                    "INSERT INTO files_lines (files_id, lines_id) VALUES (?, ?)",
                    (files_id, lines_id),
                )

                # Tokens are inserted in order, so consecutive tokens of a line
                # get consecutive ids; search() relies on this.
                for t in sudachi.tokenize(r):
                    token_id = self.db.execute(
                        'INSERT INTO "token" (dict, pos) VALUES (?, ?)',
                        (
                            t.dictionary_form(),
                            " "
                            + " ".join(p for p in t.part_of_speech() if p != "*")
                            + " ",
                        ),
                    ).lastrowid
                    self.db.execute(
                        "INSERT INTO lines_token (lines_id, token_id) VALUES (?, ?)",
                        (lines_id, token_id),
                    )

        self.db.commit()

    def search(self, q: str):
        """Tokenize the query and return lines containing the same tokens consecutively.

        A "." in the query marks a gap that must be filled by a particle (助詞).
        """
        lines_id_map: dict[int, int] = {}
        not_lines: set[str] = set()
        token_no = 0

        for r in q.split("."):
            if token_no:
                # Bridge the "." gap with a particle token.
                next_map: dict[int, int] = {}

                for c in self.db.execute(
                    """
                    SELECT lines_id, token_id FROM lines_token
                    WHERE token_id IN (SELECT id FROM "token" WHERE pos LIKE '% 助詞 %')
                    {:s}
                    """.format(
                        f"AND lines_id NOT IN ({','.join(not_lines)})"
                        if len(not_lines)
                        else "",
                    )
                ):
                    if lines_id_map.get(c[0], -2) + 1 != c[1]:
                        not_lines.add(str(c[0]))
                        lines_id_map.pop(c[0], None)
                    else:
                        next_map[c[0]] = c[1]

                lines_id_map = next_map
                token_no += 1

            for t in sudachi.tokenize(r):
                next_map: dict[int, int] = {}

                for c in self.db.execute(
                    """
                    SELECT lines_id, token_id FROM lines_token
                    WHERE token_id IN (SELECT id FROM "token" WHERE dict = ? {:s})
                    {:s}
                    """.format(
                        "AND ("
                        + " OR ".join(
                            f"pos LIKE '% {p} %'"
                            for p in t.part_of_speech()
                            if p != "*"
                        )
                        + ")",
                        f"AND lines_id NOT IN ({','.join(not_lines)})"
                        if len(not_lines)
                        else "",
                    ),
                    (t.dictionary_form(),),
                ):
                    if token_no and lines_id_map.get(c[0], -2) + 1 != c[1]:
                        not_lines.add(str(c[0]))
                        lines_id_map.pop(c[0], None)
                    else:
                        next_map[c[0]] = c[1]

                lines_id_map = next_map
                token_no += 1

        if len(lines_id_map):
            return [
                dict(r)
                for r in self.db.execute(
                    f"""
                    SELECT "text", "time", filename
                    FROM "lines"
                    JOIN files_lines fl ON fl.lines_id = lines.id
                    JOIN "files" ON fl.files_id = files.id
                    WHERE lines_id IN ({','.join(str(k) for k in lines_id_map.keys())})
                    """
                )
            ]

        return []

    def like(self, q: str):
        """Plain substring search over the stored lines."""
        return [
            dict(r)
            for r in self.db.execute(
                f"""
                SELECT "text", "time", filename
                FROM "lines"
                JOIN files_lines fl ON fl.lines_id = lines.id
                JOIN "files" ON fl.files_id = files.id
                WHERE "text" LIKE '%'||?||'%'
                """,
                (q,),
            )
        ]
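A minimal usage sketch (not part of the gist). It assumes the script above is saved as jadb.py (a placeholder name) with an assets/ folder of transcript files in the directory above it; the example queries are also assumptions.

# Usage sketch -- "jadb" is a placeholder module name for the script above.
from jadb import DB

with DB() as db:
    # Token search: the query is tokenized with Sudachi; "." marks a gap
    # that should be filled by a particle (助詞) in the matched line.
    for row in db.search("食べ.いる"):
        print(row["filename"], row["time"], row["text"])

    # Plain substring search.
    for row in db.like("猫"):
        print(row["filename"], row["time"], row["text"])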
SudachiPy>=0.6.6
SudachiDict-Full>=20220729
regex>=2022.8.17
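These can be installed with pip (e.g. pip install -r requirements.txt, assuming the list above is saved as requirements.txt; the filename is an assumption). SudachiDict-Full supplies the "full" dictionary loaded by sudachipy.Dictionary(dict_type="full").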
GitHub Gist somehow doesn't allow naming the file list, and arbitrarily picks the gist's name from the alphabetically first filename?
Tested on Windows 11 with Python 3.10 (the pyenv-win build, which I think downloads from the Python Software Foundation).