Skip to content

Instantly share code, notes, and snippets.

@cwchentw
Last active May 3, 2021 10:48
Show Gist options
  • Save cwchentw/eec45de6fc7854a84805981250782aae to your computer and use it in GitHub Desktop.
Save cwchentw/eec45de6fc7854a84805981250782aae to your computer and use it in GitHub Desktop.
JMdict to SQLite
/* jmdict2sqlite.groovy - Load JMdict into a SQLite database.
Copyright (c) 2020 Michael Chen.
Licensed under MIT.
Copy JMdict_e to the same path of the script.
Then, run it with the following command:
$ groovy -cp path/to/sqlite-jdbc-3.30.1.jar -DentityExpansionLimit=1000000 jmdict2sqlite.groovy
You may get a copy of JMdict at http://www.edrdg.org/jmdict/edict_doc.html
*/
import java.nio.file.Paths
import java.sql.DriverManager
import java.sql.SQLException
String posTrans(String pos) {
final POS_NULL = ""
final NOUN = "noun"
final PRONOUN = "pronoun"
final VERB = "verb"
final ADJECTIVE = "adjective"
final ADJECTIVAL_NOUN = "adjectival noun"
final ADVERB = "adverb"
final INTERJECTION = "interjection"
final AUXILIARY = "auxiliary"
final AUXILIARY_VERB = "auxiliary verb"
final PARTICLE = "particle"
final CONJUNCTION = "conjunction"
final ATTRIBUTIVE = "attributive"
final UNCLASSIFIED = "unclassified"
switch (pos) {
case "adjective (keiyoushi)":
return ADJECTIVE
case "adjective (keiyoushi) - yoi/ii class":
return ADJECTIVE
case "adjectival nouns or quasi-adjectives (keiyodoshi)":
return ADJECTIVAL_NOUN
case "nouns which may take the genitive case particle `no'":
return NOUN
case "pre-noun adjectival (rentaishi)":
return ATTRIBUTIVE
case "`taru' adjective":
return ADJECTIVAL_NOUN
case "noun or verb acting prenominally":
return POS_NULL
case "adverb (fukushi)":
return ADVERB
case "adverb taking the `to' particle":
return ADVERB
case "auxiliary":
return AUXILIARY
case "auxiliary verb":
return AUXILIARY_VERB
case "auxiliary adjective":
return ADJECTIVE
case "conjunction":
return CONJUNCTION
case "interjection (kandoushi)":
return INTERJECTION
case "noun (common) (futsuumeishi)":
return NOUN
case "adverbial noun (fukushitekimeishi)":
return NOUN
case "noun, used as a suffix":
return NOUN
case "noun, used as a prefix":
return NOUN
case "noun (temporal) (jisoumeishi)":
return NOUN
case "pronoun":
return PRONOUN
case "particle":
return PARTICLE
case "unclassified":
return UNCLASSIFIED
case "Ichidan verb":
return VERB
case "Ichidan verb - kureru special class":
return VERB
case "Nidan verb with 'u' ending (archaic)":
return VERB
case "Yodan verb with `hu/fu' ending (archaic)":
return VERB
case "Yodan verb with `ru' ending (archaic)":
return VERB
case "Godan verb - -aru special class":
return VERB
case "Godan verb with `bu' ending":
return VERB
case "Godan verb with `gu' ending":
return VERB
case "Godan verb with `ku' ending":
return VERB
case "Godan verb - Iku/Yuku special class":
return VERB
case "Godan verb with `mu' ending":
return VERB
case "Godan verb with `nu' ending":
return VERB
case "Godan verb with `ru' ending":
return VERB
case "Godan verb with `ru' ending (irregular verb)":
return VERB
case "Godan verb with `su' ending":
return VERB
case "Godan verb with `tsu' ending":
return VERB
case "Godan verb with `u' ending":
return VERB
case "Godan verb with `u' ending (special class)":
return VERB
case "Godan verb - Uru old class verb (old form of Eru)":
return VERB
case "Ichidan verb - zuru verb (alternative form of -jiru verbs)":
return VERB
case "Kuru verb - special class":
return VERB
case "irregular nu verb":
return VERB
case "irregular ru verb, plain form ends with -ri":
return VERB
case "noun or participle which takes the aux. verb suru":
return VERB
case "su verb - precursor to the modern suru":
return VERB
case "suru verb - special class":
return VERB
case "suru verb - included":
return VERB
case "suru verb - irregular":
return VERB
case "`kari' adjective (archaic)":
return ADJECTIVE
case "`ku' adjective (archaic)":
return ADJECTIVE
case "`shiku' adjective (archaic)":
return ADJECTIVE
case "archaic/formal form of na-adjective":
return ADJECTIVAL_NOUN
case "proper noun":
return NOUN
case "verb unspecified":
return VERB
case "Yodan verb with `ku' ending (archaic)":
return VERB
case "Yodan verb with `gu' ending (archaic)":
return VERB
case "Yodan verb with `su' ending (archaic)":
return VERB
case "Yodan verb with `tsu' ending (archaic)":
return VERB
case "Yodan verb with `nu' ending (archaic)":
return VERB
case "Yodan verb with `bu' ending (archaic)":
return VERB
case "Yodan verb with `mu' ending (archaic)":
return VERB
case "Nidan verb (upper class) with `ku' ending (archaic)":
return VERB
case "Nidan verb (upper class) with `gu' ending (archaic)":
return VERB
case "Nidan verb (upper class) with `tsu' ending (archaic)":
return VERB
case "Nidan verb (upper class) with `dzu' ending (archaic)":
return VERB
case "Nidan verb (upper class) with `hu/fu' ending (archaic)":
return VERB
case "Nidan verb (upper class) with `bu' ending (archaic)":
return VERB
case "Nidan verb (upper class) with `mu' ending (archaic)":
return VERB
case "Nidan verb (upper class) with `yu' ending (archaic)":
return VERB
case "Nidan verb (upper class) with `ru' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `ku' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `gu' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `su' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `zu' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `tsu' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `dzu' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `nu' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `hu/fu' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `bu' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `mu' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `yu' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `ru' ending (archaic)":
return VERB
case "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)":
return VERB
default:
return POS_NULL
}
return POS_NULL
}
/* Groovy way to get the directory of the script itself. */
def path = new File(getClass().protectionDomain.codeSource.location.path).parent
def database = Paths.get(path, "jmdict.sqlite")
final createPOSTableQuery = """CREATE TABLE IF NOT EXISTS pos
(id INTEGER PRIMARY KEY AUTOINCREMENT,
pos TEXT NOT NULL)"""
final createKanaTableQuery = """CREATE TABLE IF NOT EXISTS kana
(id INTEGER PRIMARY KEY AUTOINCREMENT,
kana TEXT NOT NULL)"""
final createKanjiTableQuery = """CREATE TABLE IF NOT EXISTS kanji
(id INTEGER PRIMARY KEY AUTOINCREMENT,
kanji TEXT NOT NULL)"""
final createTranslationTableQuery = """CREATE TABLE IF NOT EXISTS definition
(id INTEGER PRIMARY KEY AUTOINCREMENT,
definition TEXT NOT NULL,
kana_id INTEGER,
kanji_id INTEGER,
pos_id INTEGER,
CONSTRAINT kana_id FOREIGN KEY(id) REFERENCES kana(id),
CONSTRAINT kanji_id FOREIGN KEY(id) REFERENCES kanji(id),
CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))"""
/* Trick to use SQLite JDBC. */
Class.forName("org.sqlite.JDBC")
groovy.sql.Sql conn = null
/* Create the tables in the database. */
try {
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")
conn.execute(createPOSTableQuery)
conn.execute(createKanaTableQuery)
conn.execute(createKanjiTableQuery)
conn.execute(createTranslationTableQuery)
}
catch (SQLException e)
{
println(e.getMessage())
}
finally {
try {
if (null != conn)
conn.close()
}
catch (SQLException e) {
println(e.getMessage())
}
}
def pos = ["noun", "pronoun", "verb", "adjective", "adjectival noun", "adverb",
"auxiliary verb", "particle", "conjunction", "interjection", "attributive",
"auxiliary", "unclassified"]
/* Write the PoS data into the database. */
try {
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")
for (String p : pos) {
final insertPOSQuery =
"""INSERT INTO pos (pos) SELECT ?
WHERE NOT EXISTS (SELECT id FROM pos WHERE pos=?)"""
conn.execute insertPOSQuery, p, p
}
}
finally {
try {
if (null != conn)
conn.close()
}
catch (SQLException e) {
println(e.getMessage())
}
}
/* Read the dictionary file as a XML object. */
def dict = new XmlSlurper(false, false, true).parse(new File("JMdict_e"))
final insertKanaQuery =
"""INSERT INTO kana (kana) SELECT ?
WHERE NOT EXISTS (SELECT id FROM kana WHERE kana=?)"""
final insertKanjiQuery =
"""INSERT INTO kanji (kanji) SELECT ?
WHERE NOT EXISTS (SELECT id FROM kanji WHERE kanji=?)"""
final insertTransQuery =
"""INSERT INTO definition (definition, kana_id, pos_id) SELECT ?,?,?
WHERE NOT EXISTS (SELECT id FROM definition WHERE definition=? AND kana_id=? AND pos_id=?)"""
final insertTransWithKanjiQuery =
"""INSERT INTO definition (definition, kana_id, kanji_id, pos_id) SELECT ?,?,?,?
WHERE NOT EXISTS (SELECT id FROM definition WHERE definition=? AND kana_id=? AND kanji_id=? AND pos_id=?)"""
final selectKanaQuery = "SELECT id FROM kana WHERE kana=?"
final selectPOSQuery = "SELECT id FROM pos WHERE pos=?"
final selectKanjiQuery = "SELECT id FROM kanji WHERE kanji=?"
System.setOut(new PrintStream(System.out, true, "UTF-8"))
try {
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC")
/* Iterate over the entries of the dictionary. */
dict.entry.each { entry ->
/* Extract kana(s) from an entry. */
def kana = []
entry.r_ele.each { r_ele ->
kana << r_ele.reb.toString()
}
/* Extract kanji(s) from an entry. */
def kanji = []
entry.k_ele.each { k_ele ->
kanji << k_ele.keb.toString()
}
/* Extract translation(s) from an entry. */
def posList = []
entry.sense.each { sense ->
def glossList = []
if (sense.pos.size() > 0) {
posList = []
sense.pos.each { p ->
posList << p.toString()
}
}
sense.gloss.each { gloss ->
glossList << gloss.toString()
}
def definition = String.join("; ", glossList)
System.out.println "kana: ${kana}"
System.out.println "kanji: ${kanji}"
System.out.println "pos: ${posList}"
if (kanji.size() <= 0) {
kana.each { na ->
posList.each { p ->
System.out.println "Record: ${na} ${p}"
conn.execute insertKanaQuery, na, na
def kana_id
conn.query(selectKanaQuery, [na]) { result ->
while (result.next()) {
kana_id = result.getInt('id')
break
}
}
def pos_id = -1
conn.query(selectPOSQuery, [posTrans(p)]) { result ->
while (result.next()) {
pos_id = result.getInt('id')
break
}
}
if (pos_id <= 0) {
return
}
conn.execute insertTransQuery, definition, kana_id, pos_id, definition, kana_id, pos_id
}
}
}
else {
kana.each { na ->
kanji.each { ji ->
posList.each { p ->
System.out.println "Record: ${na} ${ji} ${p}"
conn.execute insertKanaQuery, na, na
def kana_id
conn.query(selectKanaQuery, [na]) { result ->
while (result.next()) {
kana_id = result.getInt('id')
break
}
}
def pos_id = -1
conn.query(selectPOSQuery, [posTrans(p)]) { result ->
while (result.next()) {
pos_id = result.getInt('id')
break
}
}
if (pos_id <= 0) {
return
}
conn.execute insertKanjiQuery, ji, ji
def kanji_id = -1
conn.query(selectKanjiQuery, [ji]) { result ->
while (result.next()) {
kanji_id = result.getInt('id')
break
}
}
if (kanji_id <= 0) {
return
}
conn.execute insertTransWithKanjiQuery, definition, kana_id, kanji_id, pos_id, definition, kana_id, kanji_id, pos_id
}
}
}
}
}
}
}
finally {
try {
if (null != conn)
conn.close()
}
catch (SQLException e) {
println(e.getMessage())
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment