Last active
May 3, 2021 10:48
-
-
Save cwchentw/eec45de6fc7854a84805981250782aae to your computer and use it in GitHub Desktop.
JMdict to SQLite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* jmdict2sqlite.groovy - Load JMdict into a SQLite database. | |
Copyright (c) 2020 Michael Chen. | |
Licensed under MIT. | |
Copy JMdict_e to the same path of the script. | |
Then, run it with the following command: | |
$ groovy -cp path/to/sqlite-jdbc-3.30.1.jar -DentityExpansionLimit=1000000 jmdict2sqlite.groovy | |
You may get a copy of JMdict at http://www.edrdg.org/jmdict/edict_doc.html | |
*/ | |
import java.nio.file.Paths | |
import java.sql.DriverManager | |
import java.sql.SQLException | |
String posTrans(String pos) { | |
final POS_NULL = "" | |
final NOUN = "noun" | |
final PRONOUN = "pronoun" | |
final VERB = "verb" | |
final ADJECTIVE = "adjective" | |
final ADJECTIVAL_NOUN = "adjectival noun" | |
final ADVERB = "adverb" | |
final INTERJECTION = "interjection" | |
final AUXILIARY = "auxiliary" | |
final AUXILIARY_VERB = "auxiliary verb" | |
final PARTICLE = "particle" | |
final CONJUNCTION = "conjunction" | |
final ATTRIBUTIVE = "attributive" | |
final UNCLASSIFIED = "unclassified" | |
switch (pos) { | |
case "adjective (keiyoushi)": | |
return ADJECTIVE | |
case "adjective (keiyoushi) - yoi/ii class": | |
return ADJECTIVE | |
case "adjectival nouns or quasi-adjectives (keiyodoshi)": | |
return ADJECTIVAL_NOUN | |
case "nouns which may take the genitive case particle `no'": | |
return NOUN | |
case "pre-noun adjectival (rentaishi)": | |
return ATTRIBUTIVE | |
case "`taru' adjective": | |
return ADJECTIVAL_NOUN | |
case "noun or verb acting prenominally": | |
return POS_NULL | |
case "adverb (fukushi)": | |
return ADVERB | |
case "adverb taking the `to' particle": | |
return ADVERB | |
case "auxiliary": | |
return AUXILIARY | |
case "auxiliary verb": | |
return AUXILIARY_VERB | |
case "auxiliary adjective": | |
return ADJECTIVE | |
case "conjunction": | |
return CONJUNCTION | |
case "interjection (kandoushi)": | |
return INTERJECTION | |
case "noun (common) (futsuumeishi)": | |
return NOUN | |
case "adverbial noun (fukushitekimeishi)": | |
return NOUN | |
case "noun, used as a suffix": | |
return NOUN | |
case "noun, used as a prefix": | |
return NOUN | |
case "noun (temporal) (jisoumeishi)": | |
return NOUN | |
case "pronoun": | |
return PRONOUN | |
case "particle": | |
return PARTICLE | |
case "unclassified": | |
return UNCLASSIFIED | |
case "Ichidan verb": | |
return VERB | |
case "Ichidan verb - kureru special class": | |
return VERB | |
case "Nidan verb with 'u' ending (archaic)": | |
return VERB | |
case "Yodan verb with `hu/fu' ending (archaic)": | |
return VERB | |
case "Yodan verb with `ru' ending (archaic)": | |
return VERB | |
case "Godan verb - -aru special class": | |
return VERB | |
case "Godan verb with `bu' ending": | |
return VERB | |
case "Godan verb with `gu' ending": | |
return VERB | |
case "Godan verb with `ku' ending": | |
return VERB | |
case "Godan verb - Iku/Yuku special class": | |
return VERB | |
case "Godan verb with `mu' ending": | |
return VERB | |
case "Godan verb with `nu' ending": | |
return VERB | |
case "Godan verb with `ru' ending": | |
return VERB | |
case "Godan verb with `ru' ending (irregular verb)": | |
return VERB | |
case "Godan verb with `su' ending": | |
return VERB | |
case "Godan verb with `tsu' ending": | |
return VERB | |
case "Godan verb with `u' ending": | |
return VERB | |
case "Godan verb with `u' ending (special class)": | |
return VERB | |
case "Godan verb - Uru old class verb (old form of Eru)": | |
return VERB | |
case "Ichidan verb - zuru verb (alternative form of -jiru verbs)": | |
return VERB | |
case "Kuru verb - special class": | |
return VERB | |
case "irregular nu verb": | |
return VERB | |
case "irregular ru verb, plain form ends with -ri": | |
return VERB | |
case "noun or participle which takes the aux. verb suru": | |
return VERB | |
case "su verb - precursor to the modern suru": | |
return VERB | |
case "suru verb - special class": | |
return VERB | |
case "suru verb - included": | |
return VERB | |
case "suru verb - irregular": | |
return VERB | |
case "`kari' adjective (archaic)": | |
return ADJECTIVE | |
case "`ku' adjective (archaic)": | |
return ADJECTIVE | |
case "`shiku' adjective (archaic)": | |
return ADJECTIVE | |
case "archaic/formal form of na-adjective": | |
return ADJECTIVAL_NOUN | |
case "proper noun": | |
return NOUN | |
case "verb unspecified": | |
return VERB | |
case "Yodan verb with `ku' ending (archaic)": | |
return VERB | |
case "Yodan verb with `gu' ending (archaic)": | |
return VERB | |
case "Yodan verb with `su' ending (archaic)": | |
return VERB | |
case "Yodan verb with `tsu' ending (archaic)": | |
return VERB | |
case "Yodan verb with `nu' ending (archaic)": | |
return VERB | |
case "Yodan verb with `bu' ending (archaic)": | |
return VERB | |
case "Yodan verb with `mu' ending (archaic)": | |
return VERB | |
case "Nidan verb (upper class) with `ku' ending (archaic)": | |
return VERB | |
case "Nidan verb (upper class) with `gu' ending (archaic)": | |
return VERB | |
case "Nidan verb (upper class) with `tsu' ending (archaic)": | |
return VERB | |
case "Nidan verb (upper class) with `dzu' ending (archaic)": | |
return VERB | |
case "Nidan verb (upper class) with `hu/fu' ending (archaic)": | |
return VERB | |
case "Nidan verb (upper class) with `bu' ending (archaic)": | |
return VERB | |
case "Nidan verb (upper class) with `mu' ending (archaic)": | |
return VERB | |
case "Nidan verb (upper class) with `yu' ending (archaic)": | |
return VERB | |
case "Nidan verb (upper class) with `ru' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `ku' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `gu' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `su' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `zu' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `tsu' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `dzu' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `nu' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `hu/fu' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `bu' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `mu' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `yu' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `ru' ending (archaic)": | |
return VERB | |
case "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)": | |
return VERB | |
default: | |
return POS_NULL | |
} | |
return POS_NULL | |
} | |
/* Groovy way to get the directory of the script itself. */ | |
def path = new File(getClass().protectionDomain.codeSource.location.path).parent | |
def database = Paths.get(path, "jmdict.sqlite") | |
final createPOSTableQuery = """CREATE TABLE IF NOT EXISTS pos | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
pos TEXT NOT NULL)""" | |
final createKanaTableQuery = """CREATE TABLE IF NOT EXISTS kana | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
kana TEXT NOT NULL)""" | |
final createKanjiTableQuery = """CREATE TABLE IF NOT EXISTS kanji | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
kanji TEXT NOT NULL)""" | |
final createTranslationTableQuery = """CREATE TABLE IF NOT EXISTS definition | |
(id INTEGER PRIMARY KEY AUTOINCREMENT, | |
definition TEXT NOT NULL, | |
kana_id INTEGER, | |
kanji_id INTEGER, | |
pos_id INTEGER, | |
CONSTRAINT kana_id FOREIGN KEY(id) REFERENCES kana(id), | |
CONSTRAINT kanji_id FOREIGN KEY(id) REFERENCES kanji(id), | |
CONSTRAINT pos_id FOREIGN KEY(id) REFERENCES pos(id))""" | |
/* Trick to use SQLite JDBC. */ | |
Class.forName("org.sqlite.JDBC") | |
groovy.sql.Sql conn = null | |
/* Create the tables in the database. */ | |
try { | |
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC") | |
conn.execute(createPOSTableQuery) | |
conn.execute(createKanaTableQuery) | |
conn.execute(createKanjiTableQuery) | |
conn.execute(createTranslationTableQuery) | |
} | |
catch (SQLException e) | |
{ | |
println(e.getMessage()) | |
} | |
finally { | |
try { | |
if (null != conn) | |
conn.close() | |
} | |
catch (SQLException e) { | |
println(e.getMessage()) | |
} | |
} | |
def pos = ["noun", "pronoun", "verb", "adjective", "adjectival noun", "adverb", | |
"auxiliary verb", "particle", "conjunction", "interjection", "attributive", | |
"auxiliary", "unclassified"] | |
/* Write the PoS data into the database. */ | |
try { | |
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC") | |
for (String p : pos) { | |
final insertPOSQuery = | |
"""INSERT INTO pos (pos) SELECT ? | |
WHERE NOT EXISTS (SELECT id FROM pos WHERE pos=?)""" | |
conn.execute insertPOSQuery, p, p | |
} | |
} | |
finally { | |
try { | |
if (null != conn) | |
conn.close() | |
} | |
catch (SQLException e) { | |
println(e.getMessage()) | |
} | |
} | |
/* Read the dictionary file as a XML object. */ | |
def dict = new XmlSlurper(false, false, true).parse(new File("JMdict_e")) | |
final insertKanaQuery = | |
"""INSERT INTO kana (kana) SELECT ? | |
WHERE NOT EXISTS (SELECT id FROM kana WHERE kana=?)""" | |
final insertKanjiQuery = | |
"""INSERT INTO kanji (kanji) SELECT ? | |
WHERE NOT EXISTS (SELECT id FROM kanji WHERE kanji=?)""" | |
final insertTransQuery = | |
"""INSERT INTO definition (definition, kana_id, pos_id) SELECT ?,?,? | |
WHERE NOT EXISTS (SELECT id FROM definition WHERE definition=? AND kana_id=? AND pos_id=?)""" | |
final insertTransWithKanjiQuery = | |
"""INSERT INTO definition (definition, kana_id, kanji_id, pos_id) SELECT ?,?,?,? | |
WHERE NOT EXISTS (SELECT id FROM definition WHERE definition=? AND kana_id=? AND kanji_id=? AND pos_id=?)""" | |
final selectKanaQuery = "SELECT id FROM kana WHERE kana=?" | |
final selectPOSQuery = "SELECT id FROM pos WHERE pos=?" | |
final selectKanjiQuery = "SELECT id FROM kanji WHERE kanji=?" | |
System.setOut(new PrintStream(System.out, true, "UTF-8")) | |
try { | |
conn = groovy.sql.Sql.newInstance("jdbc:sqlite:${database}", "org.sqlite.JDBC") | |
/* Iterate over the entries of the dictionary. */ | |
dict.entry.each { entry -> | |
/* Extract kana(s) from an entry. */ | |
def kana = [] | |
entry.r_ele.each { r_ele -> | |
kana << r_ele.reb.toString() | |
} | |
/* Extract kanji(s) from an entry. */ | |
def kanji = [] | |
entry.k_ele.each { k_ele -> | |
kanji << k_ele.keb.toString() | |
} | |
/* Extract translation(s) from an entry. */ | |
def posList = [] | |
entry.sense.each { sense -> | |
def glossList = [] | |
if (sense.pos.size() > 0) { | |
posList = [] | |
sense.pos.each { p -> | |
posList << p.toString() | |
} | |
} | |
sense.gloss.each { gloss -> | |
glossList << gloss.toString() | |
} | |
def definition = String.join("; ", glossList) | |
System.out.println "kana: ${kana}" | |
System.out.println "kanji: ${kanji}" | |
System.out.println "pos: ${posList}" | |
if (kanji.size() <= 0) { | |
kana.each { na -> | |
posList.each { p -> | |
System.out.println "Record: ${na} ${p}" | |
conn.execute insertKanaQuery, na, na | |
def kana_id | |
conn.query(selectKanaQuery, [na]) { result -> | |
while (result.next()) { | |
kana_id = result.getInt('id') | |
break | |
} | |
} | |
def pos_id = -1 | |
conn.query(selectPOSQuery, [posTrans(p)]) { result -> | |
while (result.next()) { | |
pos_id = result.getInt('id') | |
break | |
} | |
} | |
if (pos_id <= 0) { | |
return | |
} | |
conn.execute insertTransQuery, definition, kana_id, pos_id, definition, kana_id, pos_id | |
} | |
} | |
} | |
else { | |
kana.each { na -> | |
kanji.each { ji -> | |
posList.each { p -> | |
System.out.println "Record: ${na} ${ji} ${p}" | |
conn.execute insertKanaQuery, na, na | |
def kana_id | |
conn.query(selectKanaQuery, [na]) { result -> | |
while (result.next()) { | |
kana_id = result.getInt('id') | |
break | |
} | |
} | |
def pos_id = -1 | |
conn.query(selectPOSQuery, [posTrans(p)]) { result -> | |
while (result.next()) { | |
pos_id = result.getInt('id') | |
break | |
} | |
} | |
if (pos_id <= 0) { | |
return | |
} | |
conn.execute insertKanjiQuery, ji, ji | |
def kanji_id = -1 | |
conn.query(selectKanjiQuery, [ji]) { result -> | |
while (result.next()) { | |
kanji_id = result.getInt('id') | |
break | |
} | |
} | |
if (kanji_id <= 0) { | |
return | |
} | |
conn.execute insertTransWithKanjiQuery, definition, kana_id, kanji_id, pos_id, definition, kana_id, kanji_id, pos_id | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
finally { | |
try { | |
if (null != conn) | |
conn.close() | |
} | |
catch (SQLException e) { | |
println(e.getMessage()) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment