Skip to content

Instantly share code, notes, and snippets.

@cwchentw
Last active August 24, 2020 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cwchentw/182787875389c2a664fcad011e3a4c61 to your computer and use it in GitHub Desktop.
Save cwchentw/182787875389c2a664fcad011e3a4c61 to your computer and use it in GitHub Desktop.
KEDICT to SQLite
/* kedict2sqlite.groovy - Load kedict into a SQLite database.
Copyright (c) 2020 Michael Chen.
Licensed under MIT.
Copy kedict.yml to the same path of the script.
Then, run it with the following command:
$ groovy -cp path/to/sqlite-jdbc-x.y.z.jar kedict2sqlite.groovy
You may get a copy of kedict at https://github.com/mhagiwara/cc-kedict
*/
@Grab(group='org.yaml', module='snakeyaml', version='1.26')
/* Due to a SQLite-JDBC issue, we have to load the JAR manually. */
/* @Grab(group='org.xerial', module='sqlite-jdbc', version='3.31.1') */
import java.nio.file.Paths
import org.yaml.snakeyaml.Yaml
import java.sql.DriverManager
import java.sql.SQLException
/* PoS = part of speech (詞性). */
/* Translate a PoS abbr. to its full name. */
String posTrans(s) {
switch (s) {
case "n":
return "noun"
case "v":
return "verb"
case "a":
return "adjective"
case "pron":
return "pronoun"
case "propn":
return "proper noun"
case "intj":
return "interjection"
case "det":
return "determiner"
case "part":
return "particle"
case "adv":
return "adverb"
case "num":
return "number"
case "abbrev":
return "abbreviation"
case "suf":
return "suffix"
case "pref":
return "prefix"
default:
throw "Unknown pos: ${s}"
}
}
/* Type of a parsed YAML object. */
class Word {
String word
String romaja
String pos
String hanja /* nullable */
List<Object> defs
List<String> syns /* nullable */
List<String> ants /* nullable */
List<String> rels /* nullable */
List<String> ders /* nullable */
List<String> cf /* nullable */
List<String> tags /* nullable */
List<String> conj /* nullable */
/* Either List<Object> or String.
nullable */
Object notes
}
final createWordTableQuery = """CREATE TABLE IF NOT EXISTS word
(id INTEGER PRIMARY KEY AUTOINCREMENT,
word TEXT NOT NULL)"""
final createPOSTableQuery = """CREATE TABLE IF NOT EXISTS pos
(id INTEGER PRIMARY KEY AUTOINCREMENT,
pos TEXT NOT NULL)"""
final createHanjaTableQuery = """CREATE TABLE IF NOT EXISTS hanja
(id INTEGER PRIMARY KEY AUTOINCREMENT,
hanja TEXT NOT NULL,
word_id INTEGER,
pos_id INTEGER,
CONSTRAINT word_id FOREIGN KEY(id) REFERENCES word(id),
CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))"""
final createTranslationTableQuery = """CREATE TABLE IF NOT EXISTS definition
(id INTEGER PRIMARY KEY AUTOINCREMENT,
definition TEXT NOT NULL,
word_id INTEGER,
pos_id INTEGER,
CONSTRAINT word_id FOREIGN KEY(id) REFERENCES word(id),
CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))"""
/* Groovy way to get the directory of the script itself. */
def path = new File(getClass().protectionDomain.codeSource.location.path).parent
def database = Paths.get(path, "kedict.sqlite")
/* Trick to use SQLite JDBC. */
Class.forName("org.sqlite.JDBC")
groovy.sql.Sql conn = null
/* Create the tables in the database. */
try {
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")
conn.execute(createWordTableQuery)
conn.execute(createPOSTableQuery)
conn.execute(createHanjaTableQuery)
conn.execute(createTranslationTableQuery)
}
catch (SQLException e)
{
println(e.getMessage())
}
finally {
try {
if (null != conn)
conn.close()
}
catch (SQLException e) {
println(e.getMessage())
}
}
/* Here are the PoS of Korean. */
def pos = ["noun", "proper noun", "pronoun", "number",
"verb", "adjective", "adverb",
"interjection", "determiner", "particle",
"abbreviation", "suffix", "prefix"]
/* Write the PoS data into the database. */
try {
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")
for (String p : pos) {
final insertPOSQuery =
"""INSERT INTO pos (pos) SELECT ?
WHERE NOT EXISTS (SELECT id FROM pos WHERE pos=?)"""
conn.execute insertPOSQuery, p, p
}
}
finally {
try {
if (null != conn)
conn.close()
}
catch (SQLException e) {
println(e.getMessage())
}
}
/* Load kedict.yml into a list of YAML objects. */
final yaml = new Yaml()
final input = new FileInputStream(new File("kedict.yml"))
List<Object> data = yaml.load(input)
try {
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")
/* Load one lexicon per iteration. */
for (Object d : data) {
/* Java way to parse a YAML object into a Java object. */
Word w = (Word) d
/* Insert non-duplicated word into the database. */
final insertWordQuery =
"""INSERT INTO word (word) SELECT ?
WHERE NOT EXISTS (SELECT id FROM word WHERE word=?)"""
conn.execute insertWordQuery, w.word, w.word
final selectWordQuery = "SELECT id FROM word WHERE word=?"
final selectPOSQuery = "SELECT id FROM pos WHERE pos=?"
/* Get the id of the word. */
def wid
conn.query(selectWordQuery, [w.word]) { result ->
while (result.next()) {
wid = result.getInt('id')
break
}
}
/* Get the id of the PoS. */
def pid
conn.query(selectPOSQuery, [posTrans(w.pos)]) { result ->
while (result.next()) {
pid = result.getInt('id')
break
}
}
/* Write the non-duplicated hanja (漢字)
into the database only when it is available. */
if (w.hanja) {
final insertHanjaQuery =
"""INSERT INTO hanja (hanja, word_id, pos_id) SELECT ?,?,?
WHERE NOT EXISTS (SELECT id FROM hanja WHERE hanja=?)"""
conn.execute insertHanjaQuery, w.hanja, wid, pid, w.hanja
}
/* Write the definitions of a word into the database.
One word usually owns multiple definitions. */
for (Object df : w.defs) {
String trans = (String) df.def
final insertTransQuery =
"""INSERT INTO definition (definition, word_id, pos_id) SELECT ?,?,?
WHERE NOT EXISTS (SELECT id FROM definition WHERE definition=? AND word_id=? AND pos_id=?)"""
conn.execute insertTransQuery, trans, wid, pid, trans, wid, pid
}
}
}
finally {
try {
if (null != conn)
conn.close()
}
catch (SQLException e) {
println(e.getMessage())
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment