Last active
August 29, 2015 14:23
-
-
Save k8si/159be8027acf99c74b46 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io._ | |
import cc.factorie.app.nlp.Document | |
object TestStuff {

  /** Demonstrates Java serialization round-tripping of a factorie [[Document]]:
    * builds a tokenized Document, attaches a custom attr to it and to each Token,
    * serializes it to disk, deserializes it, and prints whether the attrs survived.
    * (They survive only WITH custom readObject/writeObject on Document and Token —
    * see the output transcripts in the comments below.)
    */
  def serializeStuff(): Unit = {

    // Minimal Serializable payload used to populate the attr maps.
    class Thing(s: String) extends Serializable {
      override def toString: String = s"Thing($s)"
    }

    /** Serializes `doc` to `filename` via Java object serialization.
      * Returns `filename` for convenient chaining into deserialization.
      * The stream is closed in a finally block so a failing writeObject
      * cannot leak the file handle; closing the ObjectOutputStream also
      * flushes and closes the underlying FileOutputStream.
      */
    def serializeDoc(doc: Document, filename: String): String = {
      val out = new ObjectOutputStream(new FileOutputStream(filename))
      try out.writeObject(doc)
      finally out.close()
      filename
    }

    /** Deserializes a [[Document]] previously written by `serializeDoc`.
      * The stream is closed in a finally block so a failing readObject
      * cannot leak the file handle.
      */
    def deserializeDoc(filename: String): Document = {
      val in = new ObjectInputStream(new FileInputStream(filename))
      try in.readObject().asInstanceOf[Document]
      finally in.close()
    }

    val str = "Barack Obama spoke yesterday"
    val doc = new Document(str)
    cc.factorie.app.nlp.segment.DeterministicTokenizer.process(doc)
    doc.attr += new Thing("thing")
    println("original string: " + doc.string)
    println("original attr: " + doc.attr.toString)
    println("original # tokens: " + doc.tokens.size)
    println("original tokens:")
    doc.tokens.foreach { t =>
      t.attr += new Thing("token-thing")
      println(t.string + " attr: " + t.attr.toString)
    }
    /** Output
      *
      * original string: Barack Obama spoke yesterday
      * original attr: Thing(thing)
      * original # tokens: 4
      * original tokens:
      * Barack attr: Thing(token-thing)
      * Obama attr: Thing(token-thing)
      * spoke attr: Thing(token-thing)
      * yesterday attr: Thing(token-thing)
      *
      */
    println("")
    println("serializing...")
    val fname = serializeDoc(doc, "test.ser")
    println("deserializing...")
    val newDoc = deserializeDoc(fname)
    println("")
    println("deserialized string: " + newDoc.string)
    println("deserialized attr: " + newDoc.attr.toString)
    println("deserialized # tokens: " + newDoc.tokens.size)
    println("deserialized tokens:")
    newDoc.tokens.foreach { t => println(t.string + " attr: " + t.attr.toString) }
    /** Output (WITHOUT custom {read/write}Object on Document and Token)
      *
      * deserialized string: Barack Obama spoke yesterday
      * deserialized attr:
      * deserialized # tokens: 4
      * deserialized tokens:
      * Barack attr:
      * Obama attr:
      * spoke attr:
      * yesterday attr:
      *
      */
    /** Output (WITH custom {read/write}Object on Document and Token)
      *
      * deserialized string: Barack Obama spoke yesterday
      * deserialized attr: Thing(thing)
      * deserialized # tokens: 4
      * deserialized tokens:
      * Barack attr: Thing(token-thing)
      * Obama attr: Thing(token-thing)
      * spoke attr: Thing(token-thing)
      * yesterday attr: Thing(token-thing)
      *
      */
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment