# a-chen/remove-chinese-from-openlp.py

## remove-chinese-from-openlp.py
# python 3.x

# Script to remove Chinese text from OpenLP song titles and lyrics.
# Chinese lyrics are typically wrapped in {y} ... {/y} tags.
# SQL examples here https://www.digitalocean.com/community/tutorials/how-to-use-the-sqlite3-module-in-python-3

# USAGE
# point this script at songs.sqlite OpenLP file
#   python3 ./remove-chinese-from-openlp.py songs.sqlite
# Created to strip Chinese from CBM's OpenLP song files, where the Chinese text is surrounded by tags.
# Titles are matched against (hopefully) all Chinese characters, but there weren't many matches anyway. The same matching can be adapted to target the body text as well.
# Author: Andrew Chen achen.this@gmail.com

import sqlite3
import re
import datetime
import os
import logging
import sys

# Start every run with a fresh debug.log. basicConfig()'s `filemode` argument
# only applies together with `filename`, which cannot be combined with explicit
# `handlers`, so the write mode is set on the FileHandler itself: mode="w"
# truncates whatever a previous run left behind.
log = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("debug.log", mode="w"),
        logging.StreamHandler(sys.stdout),
    ],
)

# Exact titles of Chinese-only songs. remove_chinese_from_song_starting_with()
# skips any row whose title equals one of these strings, so neither the title
# nor the lyrics of these songs are modified.
chinese_only_song_titles = [
    "H710 睡罷親愛",
    "H700 慈仁牧人",
    "H190 恩愛標本",
    "H056 在加利利沿海之地",
    "H057 耶穌我主榮耀王",
    "H101 釋放,榮耀釋放",
    "H141 耶穌得勝 歡聲雷動",
    "H142 更美",
    "H154 我們的大君彌賽亞",
    "H159 我能否忘快來的主",
    "H175 我們雖然時常搖動",
    "H166 耶穌這名甜美芬芳",
    "H206(2) 有祂同在就是天堂",
    "H244 求主用血洗我清潔",
    "H230 主阿我深愛你",
    "H263 主的妙愛",
    "H265 從前我亦曾熱心",
    "H266 主是我否",
    "H292 救主你的如火眼睛",
    "H275 往前走",
    "H346 思念主",
    "H345 我已揀選主耶穌",
    "H351 住在主裏面",
    "H368 我為基督而生",
    "H370 父阿,是的",
    "H499 我今所知主恩有限",
    "H403 聯於基督",
    "H415 嘴唇不潔,心思不潔",
    "H416 當我蒙恩能夠施恩",
    "H417 一生聰明未遇敵手",
    "H423 哦,各各他",
    "H447 我無能力,我的主",
    "H446 神的應許不能廢去",
    "H444 我與神聖的愛立約",
    "H437 主阿,即或盡是黑暗",
    "H473 求握我手引領我",
    "H510 當我在於最暗時間",
    "H526 當那偉大日子",
    "H525 主,當我們被提上升",
    "H513 主已知道你的挫折",
    "H515 現今時候已不多",
    "H524 主耶穌,我正等待",
    "H533 更美地",
    "H534 天是我家",
    "H543 哦主撒冷是你所建設",
    "H582 親愛主寶貝主",
    "H574(2) 無別聲音破此靜寂",
    "H656 當你苦受撒但試探",
    "H665 從軍的教會",
    "H675 神的子民被寇",
    "H695 弟兄和睦同居",
    "H693 我們呼吸天上空氣",
    "H682 阿爸,我們進前來",
    "H643 神聖呼召",
    "H661 耶穌的十字架",
    "H664 福音執事當宣告",
    "H669 爭戰原來不是你的",
    "H020 神的大愛",
    "H013 主耶和華是我的力量",
    "H610 罪惡途中多年流蕩",
]


# Build the regex used to detect Chinese (Han script) characters.
# Approach taken from https://stackoverflow.com/questions/2718196/find-all-chinese-text-in-a-string-using-python-and-regex
def build_re(ranges=None):
    """Compile a character-class regex matching any of the given code points.

    Args:
        ranges: Iterable whose items are either a single integer code point
            or a two-item ``[first, last]`` list/tuple describing an
            inclusive range. Defaults to the module-level ``LHan`` table.

    Returns:
        A compiled pattern that matches exactly one character from the set.

    Raises:
        re.error: If no usable code point remains, because an empty
            character class is not a valid pattern.
    """
    if ranges is None:
        ranges = LHan

    parts = []
    for entry in ranges:
        if isinstance(entry, (list, tuple)):
            first, last = entry
            code_points = (first, last)
        else:
            code_points = (entry,)

        try:
            # Escape each member so characters that are special inside a
            # character class (']', '-', '\\', '^') are taken literally.
            chars = [re.escape(chr(cp)) for cp in code_points]
        except (ValueError, OverflowError):
            # chr() rejects values outside 0..0x10FFFF; such entries are
            # skipped (best effort) rather than aborting the whole table.
            continue

        # One char -> "X", a pair -> "X-Y" (an inclusive class range).
        parts.append('-'.join(chars))

    return re.compile('[%s]' % ''.join(parts), re.UNICODE)


# Code points treated as Chinese (Han script) by build_re(): a bare integer is
# a single code point, a two-item list is an inclusive [first, last] range.
# NOTE(review): several upper bounds (e.g. U+9FC3, U+4DB5, U+2A6D6) look like
# they come from an older Unicode release; ideographs assigned later would not
# be matched — confirm whether that matters for this data.
LHan = [[0x2E80, 0x2E99],  # Han # So  [26] CJK RADICAL REPEAT, CJK RADICAL RAP
        [0x2E9B, 0x2EF3],  # Han # So  [89] CJK RADICAL CHOKE, CJK RADICAL C-SIMPLIFIED TURTLE
        [0x2F00, 0x2FD5],  # Han # So [214] KANGXI RADICAL ONE, KANGXI RADICAL FLUTE
        0x3005,  # Han # Lm       IDEOGRAPHIC ITERATION MARK
        0x3007,  # Han # Nl       IDEOGRAPHIC NUMBER ZERO
        [0x3021, 0x3029],  # Han # Nl   [9] HANGZHOU NUMERAL ONE, HANGZHOU NUMERAL NINE
        [0x3038, 0x303A],  # Han # Nl   [3] HANGZHOU NUMERAL TEN, HANGZHOU NUMERAL THIRTY
        0x303B,  # Han # Lm       VERTICAL IDEOGRAPHIC ITERATION MARK
        [0x3400, 0x4DB5],  # Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400, CJK UNIFIED IDEOGRAPH-4DB5
        [0x4E00, 0x9FC3],  # Han # Lo [20932] CJK UNIFIED IDEOGRAPH-4E00, CJK UNIFIED IDEOGRAPH-9FC3
        [0xF900, 0xFA2D],  # Han # Lo [302] CJK COMPATIBILITY IDEOGRAPH-F900, CJK COMPATIBILITY IDEOGRAPH-FA2D
        [0xFA30, 0xFA6A],  # Han # Lo  [59] CJK COMPATIBILITY IDEOGRAPH-FA30, CJK COMPATIBILITY IDEOGRAPH-FA6A
        [0xFA70, 0xFAD9],  # Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70, CJK COMPATIBILITY IDEOGRAPH-FAD9
        [0x20000, 0x2A6D6],  # Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000, CJK UNIFIED IDEOGRAPH-2A6D6
        [0x2F800, 0x2FA1D]]  # Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800, CJK COMPATIBILITY IDEOGRAPH-2FA1D
# Compiled once at import time; used to detect and strip Chinese in song titles.
chinese_re_pattern = build_re()


# Song title prefixes seen in the database:
#   Hxxx   - Hymnary songs
#   Ch*    - CBM chorus songs
#   SBxxx  - CBM youth songs
#   CISxxx - Christ in Song
#   Cxxx   - origin unknown
def remove_chinese_from_song_starting_with(prefix, cursor):
    """Strip Chinese text from every song whose title starts with ``prefix``.

    Titles listed in ``chinese_only_song_titles`` are left untouched. For all
    other matching songs, Chinese characters are removed from the title and
    every ``{y}...{/y}`` section is removed from the lyrics. Rows that end up
    unchanged are not rewritten.

    Args:
        prefix: Title prefix to select songs by (used in a SQL ``LIKE``
            pattern, so ``%`` and ``_`` inside it act as wildcards).
        cursor: sqlite3 cursor on a database containing a ``songs`` table with
            ``title`` and ``lyrics`` columns. Committing is left to the caller.

    Returns:
        None. Progress and totals are written to the module logger.
    """
    log.debug("Starting Chinese removal for songs starting with %s", prefix)

    # The prefix is bound as a parameter rather than formatted into the SQL.
    # rowid is fetched so each song can be updated individually: updating by
    # title would overwrite every song sharing that title with one row's
    # lyrics. NOTE(review): assumes `songs` is an ordinary SQLite rowid table
    # (not declared WITHOUT ROWID) — confirm against the OpenLP schema.
    rows = cursor.execute(
        "select rowid, title, lyrics from songs where title like ?",
        ("{}%".format(prefix),),
    ).fetchall()

    # Chinese lyrics are only recognised when wrapped in {y}...{/y}.
    # Non-greedy so each tagged section is removed on its own; DOTALL lets a
    # section span multiple lines.
    tagged_lyrics_re = re.compile(r"\{y\}.*?\{/y\}", re.DOTALL)

    num_lyrics_updated = 0
    num_titles_updated = 0

    for row_id, title, lyrics in rows:
        if title in chinese_only_song_titles:
            continue  # Chinese-only song: leave it exactly as it is

        modified_title = title
        modified_lyrics = lyrics

        # clean title
        if chinese_re_pattern.search(title) is not None:
            log.debug("%s found to have Chinese title", title)
            # Strip after the substitution so whitespace that separated the
            # removed Chinese text from the rest of the title does not linger.
            modified_title = chinese_re_pattern.sub('', title).strip()
            num_titles_updated += 1

        # clean lyrics (a NULL lyrics column has nothing to clean)
        if lyrics is not None and tagged_lyrics_re.search(lyrics) is not None:
            log.debug("%s found to have Chinese lyrics", title)
            modified_lyrics = cleanup_empty_lines(tagged_lyrics_re.sub('', lyrics))
            num_lyrics_updated += 1

        if modified_title == title and modified_lyrics == lyrics:
            continue  # nothing changed, no need to rewrite the row

        # update song
        cursor.execute(
            "update songs set title = ?, lyrics = ? where rowid = ?",
            (modified_title, modified_lyrics, row_id))

        log.debug("%s updated with title: %s and lyrics: %s", title, modified_title, modified_lyrics)

    log.info("%s songs fetched to process for titles starting with %s", len(rows), prefix)
    log.info("%s song titles matched and processed for titles starting with %s", num_titles_updated, prefix)
    log.info("%s song lyrics matched and processed for titles starting with %s", num_lyrics_updated, prefix)
    log.info("-------------------------------------------")


# Allow at most two consecutive newline characters:
# any longer run of newlines is collapsed down to exactly two.
def cleanup_empty_lines(string):
    """Return ``string`` with every run of 3+ newlines replaced by two."""
    excess_newlines = re.compile(r"\n{3,}")
    return excess_newlines.sub("\n\n", string)


def update_version(cursor):
    """Stamp the "Version" song(s) with a fresh project version string.

    Every song whose title starts with ``Version`` gets its lyrics replaced by
    ``remove-chinese-project-version:v<current local timestamp>``. If no such
    song exists, a warning is logged and nothing is changed.

    Args:
        cursor: sqlite3 cursor on a database containing a ``songs`` table with
            ``title`` and ``lyrics`` columns. Committing is left to the caller.

    Returns:
        None.
    """
    version_query = "select title, lyrics from songs where title like ?"
    version_like = ("Version%",)

    rows = cursor.execute(version_query, version_like).fetchall()
    if not rows:
        # Without this guard the read-back below would fail on fetchone()
        # returning None and abort the whole run.
        log.warning("No song with a title starting with 'Version' found; version not updated")
        return

    # One timestamp for the whole run so every version row gets the same value.
    replacement_string = "remove-chinese-project-version:v{}".format(datetime.datetime.now())

    for title, lyrics in rows:
        log.debug("Current version: %s", lyrics)
        cursor.execute("update songs set lyrics = ? where title = ?", (replacement_string, title))

    # Read the value back from the database to log what was actually stored.
    version = cursor.execute(version_query, version_like).fetchone()[1]
    log.info("Version updated to %s", version)


def main(db_path=None):
    """Remove Chinese text from an OpenLP songs database and stamp its version.

    Args:
        db_path: Path to the OpenLP ``songs.sqlite`` file. When omitted, the
            first command-line argument is used, falling back to
            ``songs.sqlite`` in the current directory.

    Exits with status 1 if the database file does not exist.
    """
    if db_path is None:
        db_path = sys.argv[1] if len(sys.argv) > 1 else 'songs.sqlite'

    # sqlite3.connect() silently creates an empty database for a missing path;
    # stop early with a clear message instead of leaving a stray file behind.
    if not os.path.isfile(db_path):
        log.error("Database file not found: %s", db_path)
        sys.exit(1)

    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on an exception, but
        # does not close the connection — hence the surrounding try/finally.
        with conn:
            cursor = conn.cursor()
            remove_chinese_from_song_starting_with('H', cursor)
            remove_chinese_from_song_starting_with("CIS", cursor)
            remove_chinese_from_song_starting_with("Ch", cursor)
            remove_chinese_from_song_starting_with("C", cursor)
            update_version(cursor)
    finally:
        conn.close()
    log.info("Done")


if __name__ == "__main__":
    main()