KEDICT to SQLite
/* kedict2sqlite.groovy - Load kedict into a SQLite database. | |
Copyright (c) 2020 Michael Chen. | |
Licensed under MIT. | |
Copy kedict.yml to the same path of the script. | |
Then, run it with the following command: | |
$ groovy -cp path/to/sqlite-jdbc-x.y.z.jar kedict2sqlite.groovy | |
You may get a copy of kedict at https://github.com/mhagiwara/cc-kedict | |
*/ | |
@Grab(group='org.yaml', module='snakeyaml', version='1.26') | |
/* Due to a SQLite-JDBC issue, we have to load the JAR manually. */ | |
/* @Grab(group='org.xerial', module='sqlite-jdbc', version='3.31.1') */ | |
import java.nio.file.Paths | |
import org.yaml.snakeyaml.Yaml | |
import java.sql.DriverManager | |
import java.sql.SQLException | |
/* PoS = part of speech (詞性). */ | |
/* Translate a PoS abbr. to its full name. */ | |
String posTrans(s) { | |
switch (s) { | |
case "n": | |
return "noun" | |
case "v": | |
return "verb" | |
case "a": | |
return "adjective" | |
case "pron": | |
return "pronoun" | |
case "propn": | |
return "proper noun" | |
case "intj": | |
return "interjection" | |
case "det": | |
return "determiner" | |
case "part": | |
return "particle" | |
case "adv": | |
return "adverb" | |
case "num": | |
return "number" | |
case "abbrev": | |
return "abbreviation" | |
case "suf": | |
return "suffix" | |
case "pref": | |
return "prefix" | |
default: | |
throw "Unknown pos: ${s}" | |
} | |
} | |
/* Type of a parsed YAML object. */ | |
class Word { | |
String word | |
String romaja | |
String pos | |
String hanja /* nullable */ | |
List<Object> defs | |
List<String> syns /* nullable */ | |
List<String> ants /* nullable */ | |
List<String> rels /* nullable */ | |
List<String> ders /* nullable */ | |
List<String> cf /* nullable */ | |
List<String> tags /* nullable */ | |
List<String> conj /* nullable */ | |
/* Either List<Object> or String. | |
nullable */ | |
Object notes | |
} | |
final createWordTableQuery = """CREATE TABLE IF NOT EXISTS word | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
word TEXT NOT NULL)""" | |
final createPOSTableQuery = """CREATE TABLE IF NOT EXISTS pos | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
pos TEXT NOT NULL)""" | |
final createHanjaTableQuery = """CREATE TABLE IF NOT EXISTS hanja | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
hanja TEXT NOT NULL, | |
word_id INTEGER, | |
pos_id INTEGER, | |
CONSTRAINT word_id FOREIGN KEY(id) REFERENCES word(id), | |
CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))""" | |
final createTranslationTableQuery = """CREATE TABLE IF NOT EXISTS definition | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
definition TEXT NOT NULL, | |
word_id INTEGER, | |
pos_id INTEGER, | |
CONSTRAINT word_id FOREIGN KEY(id) REFERENCES word(id), | |
CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))""" | |
/* Groovy way to get the directory of the script itself. */ | |
def path = new File(getClass().protectionDomain.codeSource.location.path).parent | |
def database = Paths.get(path, "kedict.sqlite") | |
/* Trick to use SQLite JDBC. */ | |
Class.forName("org.sqlite.JDBC") | |
groovy.sql.Sql conn = null | |
/* Create the tables in the database. */ | |
try { | |
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC") | |
conn.execute(createWordTableQuery) | |
conn.execute(createPOSTableQuery) | |
conn.execute(createHanjaTableQuery) | |
conn.execute(createTranslationTableQuery) | |
} | |
catch (SQLException e) | |
{ | |
println(e.getMessage()) | |
} | |
finally { | |
try { | |
if (null != conn) | |
conn.close() | |
} | |
catch (SQLException e) { | |
println(e.getMessage()) | |
} | |
} | |
/* Here are the PoS of Korean. */ | |
def pos = ["noun", "proper noun", "pronoun", "number", | |
"verb", "adjective", "adverb", | |
"interjection", "determiner", "particle", | |
"abbreviation", "suffix", "prefix"] | |
/* Write the PoS data into the database. */ | |
try { | |
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC") | |
for (String p : pos) { | |
final insertPOSQuery = | |
"""INSERT INTO pos (pos) SELECT ? | |
WHERE NOT EXISTS (SELECT id FROM pos WHERE pos=?)""" | |
conn.execute insertPOSQuery, p, p | |
} | |
} | |
finally { | |
try { | |
if (null != conn) | |
conn.close() | |
} | |
catch (SQLException e) { | |
println(e.getMessage()) | |
} | |
} | |
/* Load kedict.yml into a list of YAML objects. */ | |
final yaml = new Yaml() | |
final input = new FileInputStream(new File("kedict.yml")) | |
List<Object> data = yaml.load(input) | |
try { | |
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC") | |
/* Load one lexicon per iteration. */ | |
for (Object d : data) { | |
/* Java way to parse a YAML object into a Java object. */ | |
Word w = (Word) d | |
/* Insert non-duplicated word into the database. */ | |
final insertWordQuery = | |
"""INSERT INTO word (word) SELECT ? | |
WHERE NOT EXISTS (SELECT id FROM word WHERE word=?)""" | |
conn.execute insertWordQuery, w.word, w.word | |
final selectWordQuery = "SELECT id FROM word WHERE word=?" | |
final selectPOSQuery = "SELECT id FROM pos WHERE pos=?" | |
/* Get the id of the word. */ | |
def wid | |
conn.query(selectWordQuery, [w.word]) { result -> | |
while (result.next()) { | |
wid = result.getInt('id') | |
break | |
} | |
} | |
/* Get the id of the PoS. */ | |
def pid | |
conn.query(selectPOSQuery, [posTrans(w.pos)]) { result -> | |
while (result.next()) { | |
pid = result.getInt('id') | |
break | |
} | |
} | |
/* Write the non-duplicated hanja (漢字) | |
into the database only when it is available. */ | |
if (w.hanja) { | |
final insertHanjaQuery = | |
"""INSERT INTO hanja (hanja, word_id, pos_id) SELECT ?,?,? | |
WHERE NOT EXISTS (SELECT id FROM hanja WHERE hanja=?)""" | |
conn.execute insertHanjaQuery, w.hanja, wid, pid, w.hanja | |
} | |
/* Write the definitions of a word into the database. | |
One word usually owns multiple definitions. */ | |
for (Object df : w.defs) { | |
String trans = (String) df.def | |
final insertTransQuery = | |
"""INSERT INTO definition (definition, word_id, pos_id) SELECT ?,?,? | |
WHERE NOT EXISTS (SELECT id FROM definition WHERE definition=? AND word_id=? AND pos_id=?)""" | |
conn.execute insertTransQuery, trans, wid, pid, trans, wid, pid | |
} | |
} | |
} | |
finally { | |
try { | |
if (null != conn) | |
conn.close() | |
} | |
catch (SQLException e) { | |
println(e.getMessage()) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment