Last active
August 24, 2020 14:27
-
-
Save cwchentw/182787875389c2a664fcad011e3a4c61 to your computer and use it in GitHub Desktop.
KEDICT to SQLite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* kedict2sqlite.groovy - Load kedict into a SQLite database. | |
Copyright (c) 2020 Michael Chen. | |
Licensed under MIT. | |
Copy kedict.yml to the same path of the script. | |
Then, run it with the following command: | |
$ groovy -cp path/to/sqlite-jdbc-x.y.z.jar kedict2sqlite.groovy | |
You may get a copy of kedict at https://github.com/mhagiwara/cc-kedict | |
*/ | |
@Grab(group='org.yaml', module='snakeyaml', version='1.26') | |
/* Due to a SQLite-JDBC issue, we have to load the JAR manually. */ | |
/* @Grab(group='org.xerial', module='sqlite-jdbc', version='3.31.1') */ | |
import java.nio.file.Paths | |
import org.yaml.snakeyaml.Yaml | |
import java.sql.DriverManager | |
import java.sql.SQLException | |
/* PoS = part of speech (詞性). */ | |
/* Translate a PoS abbr. to its full name. */ | |
String posTrans(s) { | |
switch (s) { | |
case "n": | |
return "noun" | |
case "v": | |
return "verb" | |
case "a": | |
return "adjective" | |
case "pron": | |
return "pronoun" | |
case "propn": | |
return "proper noun" | |
case "intj": | |
return "interjection" | |
case "det": | |
return "determiner" | |
case "part": | |
return "particle" | |
case "adv": | |
return "adverb" | |
case "num": | |
return "number" | |
case "abbrev": | |
return "abbreviation" | |
case "suf": | |
return "suffix" | |
case "pref": | |
return "prefix" | |
default: | |
throw "Unknown pos: ${s}" | |
} | |
} | |
/* Type of a parsed YAML object. */ | |
class Word { | |
String word | |
String romaja | |
String pos | |
String hanja /* nullable */ | |
List<Object> defs | |
List<String> syns /* nullable */ | |
List<String> ants /* nullable */ | |
List<String> rels /* nullable */ | |
List<String> ders /* nullable */ | |
List<String> cf /* nullable */ | |
List<String> tags /* nullable */ | |
List<String> conj /* nullable */ | |
/* Either List<Object> or String. | |
nullable */ | |
Object notes | |
} | |
final createWordTableQuery = """CREATE TABLE IF NOT EXISTS word | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
word TEXT NOT NULL)""" | |
final createPOSTableQuery = """CREATE TABLE IF NOT EXISTS pos | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
pos TEXT NOT NULL)""" | |
final createHanjaTableQuery = """CREATE TABLE IF NOT EXISTS hanja | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
hanja TEXT NOT NULL, | |
word_id INTEGER, | |
pos_id INTEGER, | |
CONSTRAINT word_id FOREIGN KEY(id) REFERENCES word(id), | |
CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))""" | |
final createTranslationTableQuery = """CREATE TABLE IF NOT EXISTS definition | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
definition TEXT NOT NULL, | |
word_id INTEGER, | |
pos_id INTEGER, | |
CONSTRAINT word_id FOREIGN KEY(id) REFERENCES word(id), | |
CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))""" | |
/* Groovy way to get the directory of the script itself. */ | |
def path = new File(getClass().protectionDomain.codeSource.location.path).parent | |
def database = Paths.get(path, "kedict.sqlite") | |
/* Trick to use SQLite JDBC. */ | |
Class.forName("org.sqlite.JDBC") | |
groovy.sql.Sql conn = null | |
/* Create the tables in the database. */ | |
try { | |
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC") | |
conn.execute(createWordTableQuery) | |
conn.execute(createPOSTableQuery) | |
conn.execute(createHanjaTableQuery) | |
conn.execute(createTranslationTableQuery) | |
} | |
catch (SQLException e) | |
{ | |
println(e.getMessage()) | |
} | |
finally { | |
try { | |
if (null != conn) | |
conn.close() | |
} | |
catch (SQLException e) { | |
println(e.getMessage()) | |
} | |
} | |
/* Here are the PoS of Korean. */ | |
def pos = ["noun", "proper noun", "pronoun", "number", | |
"verb", "adjective", "adverb", | |
"interjection", "determiner", "particle", | |
"abbreviation", "suffix", "prefix"] | |
/* Write the PoS data into the database. */ | |
try { | |
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC") | |
for (String p : pos) { | |
final insertPOSQuery = | |
"""INSERT INTO pos (pos) SELECT ? | |
WHERE NOT EXISTS (SELECT id FROM pos WHERE pos=?)""" | |
conn.execute insertPOSQuery, p, p | |
} | |
} | |
finally { | |
try { | |
if (null != conn) | |
conn.close() | |
} | |
catch (SQLException e) { | |
println(e.getMessage()) | |
} | |
} | |
/* Load kedict.yml into a list of YAML objects. */ | |
final yaml = new Yaml() | |
final input = new FileInputStream(new File("kedict.yml")) | |
List<Object> data = yaml.load(input) | |
try { | |
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC") | |
/* Load one lexicon per iteration. */ | |
for (Object d : data) { | |
/* Java way to parse a YAML object into a Java object. */ | |
Word w = (Word) d | |
/* Insert non-duplicated word into the database. */ | |
final insertWordQuery = | |
"""INSERT INTO word (word) SELECT ? | |
WHERE NOT EXISTS (SELECT id FROM word WHERE word=?)""" | |
conn.execute insertWordQuery, w.word, w.word | |
final selectWordQuery = "SELECT id FROM word WHERE word=?" | |
final selectPOSQuery = "SELECT id FROM pos WHERE pos=?" | |
/* Get the id of the word. */ | |
def wid | |
conn.query(selectWordQuery, [w.word]) { result -> | |
while (result.next()) { | |
wid = result.getInt('id') | |
break | |
} | |
} | |
/* Get the id of the PoS. */ | |
def pid | |
conn.query(selectPOSQuery, [posTrans(w.pos)]) { result -> | |
while (result.next()) { | |
pid = result.getInt('id') | |
break | |
} | |
} | |
/* Write the non-duplicated hanja (漢字) | |
into the database only when it is available. */ | |
if (w.hanja) { | |
final insertHanjaQuery = | |
"""INSERT INTO hanja (hanja, word_id, pos_id) SELECT ?,?,? | |
WHERE NOT EXISTS (SELECT id FROM hanja WHERE hanja=?)""" | |
conn.execute insertHanjaQuery, w.hanja, wid, pid, w.hanja | |
} | |
/* Write the definitions of a word into the database. | |
One word usually owns multiple definitions. */ | |
for (Object df : w.defs) { | |
String trans = (String) df.def | |
final insertTransQuery = | |
"""INSERT INTO definition (definition, word_id, pos_id) SELECT ?,?,? | |
WHERE NOT EXISTS (SELECT id FROM definition WHERE definition=? AND word_id=? AND pos_id=?)""" | |
conn.execute insertTransQuery, trans, wid, pid, trans, wid, pid | |
} | |
} | |
} | |
finally { | |
try { | |
if (null != conn) | |
conn.close() | |
} | |
catch (SQLException e) { | |
println(e.getMessage()) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment