cwchentw/kedict2sqlite.groovy

## kedict2sqlite.groovy
/* kedict2sqlite.groovy - Load kedict into a SQLite database.
   Copyright (c) 2020 Michael Chen.
   Licensed under MIT.


   Copy kedict.yml to the same path of the script.
   Then, run it with the following command:

       $ groovy -cp path/to/sqlite-jdbc-x.y.z.jar kedict2sqlite.groovy


   You may get a copy of kedict at https://github.com/mhagiwara/cc-kedict
*/


@Grab(group='org.yaml', module='snakeyaml', version='1.26')
/* Due to a SQLite-JDBC issue, we have to load the JAR manually. */
/* @Grab(group='org.xerial', module='sqlite-jdbc', version='3.31.1') */

import java.nio.file.Paths
import org.yaml.snakeyaml.Yaml
import java.sql.DriverManager
import java.sql.SQLException


/* PoS = part of speech (詞性). */
/* Translate a PoS abbr. to its full name. */
String posTrans(s) {
    switch (s) {
    case "n":
        return "noun"
    case "v":
        return "verb"
    case "a":
        return "adjective"
    case "pron":
        return "pronoun"
    case "propn":
        return "proper noun"
    case "intj":
        return "interjection"
    case "det":
        return "determiner"
    case "part":
        return "particle"
    case "adv":
        return "adverb"
    case "num":
        return "number"
    case "abbrev":
        return "abbreviation"
    case "suf":
        return "suffix"
    case "pref":
        return "prefix"
    default:
        throw "Unknown pos: ${s}"
    }
}

/* Type of a parsed YAML object. */
class Word {
    String word
    String romaja
    String pos
    String hanja  /* nullable */
    List<Object> defs
    List<String> syns  /* nullable */
    List<String> ants  /* nullable */
    List<String> rels  /* nullable */
    List<String> ders  /* nullable */
    List<String> cf    /* nullable */
    List<String> tags  /* nullable */
    List<String> conj  /* nullable */

    /* Either List<Object> or String.
       nullable */
    Object notes
}

final createWordTableQuery = """CREATE TABLE IF NOT EXISTS word
(id INTEGER PRIMARY KEY AUTOINCREMENT,
 word TEXT NOT NULL)"""
final createPOSTableQuery = """CREATE TABLE IF NOT EXISTS pos
(id INTEGER PRIMARY KEY AUTOINCREMENT,
 pos TEXT NOT NULL)"""
final createHanjaTableQuery = """CREATE TABLE IF NOT EXISTS hanja
(id INTEGER PRIMARY KEY AUTOINCREMENT,
 hanja TEXT NOT NULL,
 word_id INTEGER,
 pos_id INTEGER,
 CONSTRAINT word_id FOREIGN KEY(id) REFERENCES word(id),
 CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))"""
final createTranslationTableQuery = """CREATE TABLE IF NOT EXISTS definition
(id INTEGER PRIMARY KEY AUTOINCREMENT,
 definition TEXT NOT NULL,
 word_id INTEGER,
 pos_id INTEGER,
 CONSTRAINT word_id FOREIGN KEY(id) REFERENCES word(id),
 CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))"""

/* Groovy way to get the directory of the script itself. */
def path = new File(getClass().protectionDomain.codeSource.location.path).parent
def database = Paths.get(path, "kedict.sqlite")

/* Trick to use SQLite JDBC. */
Class.forName("org.sqlite.JDBC")

groovy.sql.Sql conn = null

/* Create the tables in the database. */
try {
    conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")

    conn.execute(createWordTableQuery)
    conn.execute(createPOSTableQuery)
    conn.execute(createHanjaTableQuery)
    conn.execute(createTranslationTableQuery)
}
catch (SQLException e)
{
    println(e.getMessage())
}
finally {
    try {
        if (null != conn)
            conn.close()
    }
    catch (SQLException e) {
        println(e.getMessage())
    }
}

/* Here are the PoS of Korean. */
def pos = ["noun", "proper noun", "pronoun", "number",
    "verb", "adjective", "adverb",
    "interjection", "determiner", "particle",
    "abbreviation", "suffix", "prefix"]

/* Write the PoS data into the database. */
try {
    conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")

    for (String p : pos) {
        final insertPOSQuery =
"""INSERT INTO pos (pos) SELECT ?
WHERE NOT EXISTS (SELECT id FROM pos WHERE pos=?)"""

        conn.execute insertPOSQuery, p, p
    }
}
finally {
    try {
        if (null != conn)
            conn.close()
    }
    catch (SQLException e) {
        println(e.getMessage())
    }
}


/* Load kedict.yml into a list of YAML objects. */
final yaml = new Yaml()
final input = new FileInputStream(new File("kedict.yml"))
List<Object> data = yaml.load(input)

try {
    conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")

    /* Load one lexicon per iteration. */
    for (Object d : data) {
        /* Java way to parse a YAML object into a Java object. */
        Word w = (Word) d

        /* Insert non-duplicated word into the database. */
        final insertWordQuery =
"""INSERT INTO word (word) SELECT ?
WHERE NOT EXISTS (SELECT id FROM word WHERE word=?)"""

        conn.execute insertWordQuery, w.word, w.word

        final selectWordQuery = "SELECT id FROM word WHERE word=?"
        final selectPOSQuery = "SELECT id FROM pos WHERE pos=?"

        /* Get the id of the word. */
        def wid
        conn.query(selectWordQuery, [w.word]) { result ->
            while (result.next()) {
                wid = result.getInt('id')
                break
            }
        }

        /* Get the id of the PoS. */
        def pid
        conn.query(selectPOSQuery, [posTrans(w.pos)]) { result ->
            while (result.next()) {
                pid = result.getInt('id')
                break
            }
        }

        /* Write the non-duplicated hanja (漢字)
            into the database only when it is available. */
        if (w.hanja) {
            final insertHanjaQuery =
"""INSERT INTO hanja (hanja, word_id, pos_id) SELECT ?,?,?
WHERE NOT EXISTS (SELECT id FROM hanja WHERE hanja=?)"""

            conn.execute insertHanjaQuery, w.hanja, wid, pid, w.hanja
        }

        /* Write the definitions of a word into the database.
            One word usually owns multiple definitions. */
        for (Object df : w.defs) {
            String trans = (String) df.def

            final insertTransQuery =
"""INSERT INTO definition (definition, word_id, pos_id) SELECT ?,?,?
WHERE NOT EXISTS (SELECT id FROM definition WHERE definition=? AND word_id=? AND pos_id=?)"""

            conn.execute insertTransQuery, trans, wid, pid, trans, wid, pid
        }
    }
}
finally {
    try {
        if (null != conn)
            conn.close()
    }
    catch (SQLException e) {
        println(e.getMessage())
    }
}
	/* kedict2sqlite.groovy - Load kedict into a SQLite database.
	Copyright (c) 2020 Michael Chen.
	Licensed under MIT.


	Copy kedict.yml to the same path of the script.
	Then, run it with the following command:

	$ groovy -cp path/to/sqlite-jdbc-x.y.z.jar kedict2sqlite.groovy


	You may get a copy of kedict at https://github.com/mhagiwara/cc-kedict
	*/


	@Grab(group='org.yaml', module='snakeyaml', version='1.26')
	/* Due to a SQLite-JDBC issue, we have to load the JAR manually. */
	/* @Grab(group='org.xerial', module='sqlite-jdbc', version='3.31.1') */

	import java.nio.file.Paths
	import org.yaml.snakeyaml.Yaml
	import java.sql.DriverManager
	import java.sql.SQLException


	/* PoS = part of speech (詞性). */
	/* Translate a PoS abbr. to its full name. */
	String posTrans(s) {
	switch (s) {
	case "n":
	return "noun"
	case "v":
	return "verb"
	case "a":
	return "adjective"
	case "pron":
	return "pronoun"
	case "propn":
	return "proper noun"
	case "intj":
	return "interjection"
	case "det":
	return "determiner"
	case "part":
	return "particle"
	case "adv":
	return "adverb"
	case "num":
	return "number"
	case "abbrev":
	return "abbreviation"
	case "suf":
	return "suffix"
	case "pref":
	return "prefix"
	default:
	throw "Unknown pos: ${s}"
	}
	}

	/* Type of a parsed YAML object. */
	class Word {
	String word
	String romaja
	String pos
	String hanja /* nullable */
	List<Object> defs
	List<String> syns /* nullable */
	List<String> ants /* nullable */
	List<String> rels /* nullable */
	List<String> ders /* nullable */
	List<String> cf /* nullable */
	List<String> tags /* nullable */
	List<String> conj /* nullable */

	/* Either List<Object> or String.
	nullable */
	Object notes
	}

	final createWordTableQuery = """CREATE TABLE IF NOT EXISTS word
	(id INTEGER PRIMARY KEY AUTOINCREMENT,
	word TEXT NOT NULL)"""
	final createPOSTableQuery = """CREATE TABLE IF NOT EXISTS pos
	(id INTEGER PRIMARY KEY AUTOINCREMENT,
	pos TEXT NOT NULL)"""
	final createHanjaTableQuery = """CREATE TABLE IF NOT EXISTS hanja
	(id INTEGER PRIMARY KEY AUTOINCREMENT,
	hanja TEXT NOT NULL,
	word_id INTEGER,
	pos_id INTEGER,
	CONSTRAINT word_id FOREIGN KEY(id) REFERENCES word(id),
	CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))"""
	final createTranslationTableQuery = """CREATE TABLE IF NOT EXISTS definition
	(id INTEGER PRIMARY KEY AUTOINCREMENT,
	definition TEXT NOT NULL,
	word_id INTEGER,
	pos_id INTEGER,
	CONSTRAINT word_id FOREIGN KEY(id) REFERENCES word(id),
	CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))"""

	/* Groovy way to get the directory of the script itself. */
	def path = new File(getClass().protectionDomain.codeSource.location.path).parent
	def database = Paths.get(path, "kedict.sqlite")

	/* Trick to use SQLite JDBC. */
	Class.forName("org.sqlite.JDBC")

	groovy.sql.Sql conn = null

	/* Create the tables in the database. */
	try {
	conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")

	conn.execute(createWordTableQuery)
	conn.execute(createPOSTableQuery)
	conn.execute(createHanjaTableQuery)
	conn.execute(createTranslationTableQuery)
	}
	catch (SQLException e)
	{
	println(e.getMessage())
	}
	finally {
	try {
	if (null != conn)
	conn.close()
	}
	catch (SQLException e) {
	println(e.getMessage())
	}
	}

	/* Here are the PoS of Korean. */
	def pos = ["noun", "proper noun", "pronoun", "number",
	"verb", "adjective", "adverb",
	"interjection", "determiner", "particle",
	"abbreviation", "suffix", "prefix"]

	/* Write the PoS data into the database. */
	try {
	conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")

	for (String p : pos) {
	final insertPOSQuery =
	"""INSERT INTO pos (pos) SELECT ?
	WHERE NOT EXISTS (SELECT id FROM pos WHERE pos=?)"""

	conn.execute insertPOSQuery, p, p
	}
	}
	finally {
	try {
	if (null != conn)
	conn.close()
	}
	catch (SQLException e) {
	println(e.getMessage())
	}
	}


	/* Load kedict.yml into a list of YAML objects. */
	final yaml = new Yaml()
	final input = new FileInputStream(new File("kedict.yml"))
	List<Object> data = yaml.load(input)

	try {
	conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")

	/* Load one lexicon per iteration. */
	for (Object d : data) {
	/* Java way to parse a YAML object into a Java object. */
	Word w = (Word) d

	/* Insert non-duplicated word into the database. */
	final insertWordQuery =
	"""INSERT INTO word (word) SELECT ?
	WHERE NOT EXISTS (SELECT id FROM word WHERE word=?)"""

	conn.execute insertWordQuery, w.word, w.word

	final selectWordQuery = "SELECT id FROM word WHERE word=?"
	final selectPOSQuery = "SELECT id FROM pos WHERE pos=?"

	/* Get the id of the word. */
	def wid
	conn.query(selectWordQuery, [w.word]) { result ->
	while (result.next()) {
	wid = result.getInt('id')
	break
	}
	}

	/* Get the id of the PoS. */
	def pid
	conn.query(selectPOSQuery, [posTrans(w.pos)]) { result ->
	while (result.next()) {
	pid = result.getInt('id')
	break
	}
	}

	/* Write the non-duplicated hanja (漢字)
	into the database only when it is available. */
	if (w.hanja) {
	final insertHanjaQuery =
	"""INSERT INTO hanja (hanja, word_id, pos_id) SELECT ?,?,?
	WHERE NOT EXISTS (SELECT id FROM hanja WHERE hanja=?)"""

	conn.execute insertHanjaQuery, w.hanja, wid, pid, w.hanja
	}

	/* Write the definitions of a word into the database.
	One word usually owns multiple definitions. */
	for (Object df : w.defs) {
	String trans = (String) df.def

	final insertTransQuery =
	"""INSERT INTO definition (definition, word_id, pos_id) SELECT ?,?,?
	WHERE NOT EXISTS (SELECT id FROM definition WHERE definition=? AND word_id=? AND pos_id=?)"""

	conn.execute insertTransQuery, trans, wid, pid, trans, wid, pid
	}
	}
	}
	finally {
	try {
	if (null != conn)
	conn.close()
	}
	catch (SQLException e) {
	println(e.getMessage())
	}
	}