Skip to content

Instantly share code, notes, and snippets.

@megafarad
Created October 8, 2022 00:25
Show Gist options
  • Save megafarad/92135b634db040eb801160aabab82dcd to your computer and use it in GitHub Desktop.
Save megafarad/92135b634db040eb801160aabab82dcd to your computer and use it in GitHub Desktop.
package com.megafarad.jmdictapi
import com.mongodb.client.model.ReplaceOptions
import java.io.FileInputStream
import javax.xml.stream._
import org.json._
import org.mongodb.scala._
import org.mongodb.scala.model.Filters
import java.util
import scala.collection.mutable
import scala.concurrent.Await
import scala.concurrent.duration.Duration
/**
 * One-off importer that streams a JMdict XML file and upserts every `<entry>`
 * element into a MongoDB collection as a JSON document keyed by `ent_seq`.
 *
 * The file is read with a StAX event reader; the markup of each entry is
 * re-assembled into a string, converted to JSON with `org.json.XML` and then
 * written with a blocking `replaceOne(..., upsert = true)`.
 */
object ManualImporter {

  private val MongoUri = "mongodb://localhost:27017"
  private val DatabaseName = "jisho"
  private val CollectionName = "JMdict"

  /** Input used by [[main]] when no path is given on the command line. */
  private val DefaultInputPath = "C:\\Users\\chris\\Downloads\\JMdict_e_examp.xml"

  /** Upper bound for a single blocking upsert. */
  private val WriteTimeout = Duration("5s")

  /** Elements that are always converted to JSON arrays, even when they occur only once. */
  private val ForcedListElements: Seq[String] = Seq(
    "k_ele", "r_ele", "sense", "ke_inf", "ke_pri", "re_restr", "re_inf", "re_pri",
    "links", "bibl", "etym", "audit", "stagk", "stagr", "pos", "xref", "ant",
    "field", "misc", "s_inf", "lsource", "dial", "gloss", "example"
  )

  /**
   * Re-escapes character data so it can be embedded in the rebuilt XML fragment.
   *
   * The ampersand is replaced first; otherwise the `&` introduced by the other
   * replacements would be escaped a second time.
   */
  private def escapeXml(text: String): String =
    text
      .replace("&", "&amp;")
      .replace("<", "&lt;")
      .replace(">", "&gt;")
      .replace("\"", "&quot;")
      .replace("\'", "&apos;")

  /** Builds the XML-to-JSON configuration once; it does not change between entries. */
  private def buildParserConfig(): XMLParserConfiguration = {
    val forceList = new util.HashSet[String]()
    ForcedListElements.foreach(name => forceList.add(name))
    new XMLParserConfiguration().withForceList(forceList)
  }

  /**
   * Converts one re-assembled `<entry>` fragment to JSON and upserts it.
   *
   * Failures are reported on the console and swallowed so that one bad entry
   * does not abort the whole import.
   */
  private def importEntry(collection: MongoCollection[Document],
                          entryXml: String,
                          parserConfig: XMLParserConfiguration): Unit = {
    import scala.util.control.NonFatal
    try {
      //TODO: don't like this. Would rather parse out XML and rework that way...
      val cleanXML = entryXml
        .replace("xml:lang=", "lang=")
        .replace("<re_nokanji></re_nokanji>", "<re_nokanji>true</re_nokanji>")
      val entry = org.json.XML.toJSONObject(cleanXML, parserConfig).getJSONObject("entry")
      val entSeq = entry.getInt("ent_seq")
      println(entry.toString(4))
      val upsert = collection.replaceOne(
        Filters.eq("ent_seq", entSeq),
        Document(entry.toString),
        (new ReplaceOptions).upsert(true)
      )
      Await.result(upsert.toFuture(), WriteTimeout)
    } catch {
      case jx: JSONException =>
        jx.printStackTrace()
        // Dump the offending fragment to make conversion problems diagnosable.
        println(entryXml)
      case NonFatal(other) => other.printStackTrace()
    }
  }

  /**
   * Imports every `<entry>` of the JMdict XML file at `path` into MongoDB.
   *
   * The input stream, the XML reader and the Mongo client are all released
   * when the import finishes, including when reading the file fails.
   *
   * @param path file system path of the JMdict XML file
   */
  def importFile(path: String): Unit = {
    // Sets the JDK entity expansion limit property to "0" before the XML factory is created.
    System.getProperties.setProperty("jdk.xml.entityExpansionLimit", "0")

    val parserConfig = buildParserConfig()
    val elementBuilder = new mutable.StringBuilder()

    val input = new FileInputStream(path)
    try {
      val reader = XMLInputFactory.newInstance().createXMLEventReader(input)
      try {
        val client: MongoClient = MongoClient(MongoUri)
        try {
          val collection: MongoCollection[Document] =
            client.getDatabase(DatabaseName).getCollection(CollectionName)

          while (reader.hasNext) {
            val nextEvent = reader.nextEvent()
            if (nextEvent.isStartElement) {
              val startElement = nextEvent.asStartElement()
              startElement.getName.getLocalPart match {
                case "JMdict" => println("START IMPORT")
                case _ => elementBuilder.append(startElement.toString)
              }
            } else if (nextEvent.isCharacters) {
              elementBuilder.append(escapeXml(nextEvent.asCharacters().toString))
            } else if (nextEvent.isEndElement) {
              val endElement = nextEvent.asEndElement()
              endElement.getName.getLocalPart match {
                case "JMdict" => println("END IMPORT")
                case "entry" =>
                  elementBuilder.append(endElement.toString)
                  importEntry(collection, elementBuilder.toString(), parserConfig)
                  // Start the next entry with an empty buffer, whether or not this one succeeded.
                  elementBuilder.clear()
                case _ => elementBuilder.append(endElement.toString)
              }
            }
          }
        } finally {
          client.close()
        }
      } finally {
        // The reader and the stream are closed separately, stream last.
        reader.close()
      }
    } finally {
      input.close()
    }
  }

  /**
   * Command-line entry point.
   *
   * @param args optional first argument: path of the JMdict XML file; when
   *             absent the original hard-coded default path is used
   */
  def main(args: Array[String]): Unit = {
    importFile(args.headOption.getOrElse(DefaultInputPath))
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment