Skip to content

Instantly share code, notes, and snippets.

View strubell's full-sized avatar

Emma Strubell strubell

View GitHub Profile
@strubell
strubell / modified function
Created February 12, 2014 16:38
matrix, vector left multiply vs. left multiply + sum + max
def leftMultiplyAndSumAndMax(t: Tensor1, v: DenseTensor1): (DenseTensor1, Double) = {
assert(dim1 == t.dim1, "Dimensions don't match: " + dim1 + " " + t.dim1)
val myDim2 = dim2
val newT = v.copy
val newArray = newT.asArray
var max = Double.MinValue
var currentVal = 0.0
t match {
case t: DenseTensor =>
val tArr = t.asArray
@strubell
strubell / gist:ed0e5091baf89038e19e
Created June 25, 2014 16:37
Example string causing factorie/factorie issue #168
<meta content="Read Featured Article “YesAsia 2011 Bestsellers - Movies” by YumCha! Editorial Team mentioning Donnie Yen,Shawn Yue,Jung Woo Sung,Michelle Yeoh,Jiang Wen,Andy Lau,Kaneshiro Takeshi,Yamazaki Takashi,Benny Chan,Won Bin,Kim Jee Woon,Kimura Takuya,Tsui Hark,Su Chao Pin,Barbie Hsu,Wang Xue Qi,John Woo,Peter Chan, &quot;Villain (DVD) (English Subtitled) (Hong Kong Version)&quot;,&quot;Let The Bullets Fly (2010) (Blu-ray + DVD) (Hong Kong Version)&quot;,&quot;Villain (Blu-ray) (English Subtitled) (Hong Kong Version)&quot;,&quot;The Borrower Arrietty (DVD) (English Subtitled) (2-Disc Edition) (Hong Kong Version)&quot;,&quot;Shaolin (2011) (Blu-ray) (English Subtitled) (Hong Kong Version)&quot;,&quot;A Better Tomorrow (2010) (DVD) (First Press Limited Edition) (Korea Version)&quot;,&quot;Outrage (Blu-ray) (English Subtitled) (Hong Kong Version)&quot;,&quot;The Housemaid (2010) (DVD) (Single Disc) (Korea Version)&quot;,&quot;Bruce Lee My Brother (Blu-ray) (Hong Kong Version)&quot;,&quot;Space Battleship
class ProcessSlotFillingCorpusOpts extends CmdOptions {
val dataDirs = new CmdOption[List[String]]("data-dirs", List.empty[String], "FILENAME...", "List of directories containing (only) data files in sgml format.")
val dataFiles = new CmdOption[List[String]]("data-files", List.empty[String], "FILENAME...", "List of files in sgml format.")
val dataFilesFile = new CmdOption("data-files-file", "", "FILENAME", "File containing a list of paths to data files, one per line.")
val reprocess = new CmdOption("reprocess", false, "BOOL", "Whether to re-process documents that we find to be already serialized.")
val outputDir = new CmdOption("output-dir", "", "FILENAME", "Directory to which to serialize processed docs")
val inputType = new CmdOption("input-type", "filename", "STRING", "Type of the input: filename, docid, document. Document assumes plain text serialized documents, filename is tac corpus filenames, docid is lookup into original corpus by docid.")
val retag = new CmdOption("retag-type", "none",
/** Return a string that captures the generic "shape" of the original word,
mapping lowercase alphabetics to 'a', uppercase to 'A', digits to '1', whitespace to ' '.
Skip more than 'maxRepetitions' of the same character class. */
def stringShape(word:String, maxRepetitions:Int): String = {
val sb = new StringBuffer
var i = 0; var c = 'x'; var prevc = 'x'; var repetitions = 0
while (i < word.length) {
val char = word(i)
if (Character.isUpperCase(char)) c = 'A'
else if (Character.isLowerCase(char)) c = 'a'
@strubell
strubell / gist:700e30c45721b0120165
Created January 20, 2015 20:26
qsubbing using factorie
object ProcessSlotFillingCorpusParallel {
def main(args: Array[String]) {
implicit val random = new scala.util.Random(0)
val opts = new ProcessSlotFillingCorpusParallelOpts
opts.parse(args)
/* Load data files */
if (opts.dataDirs.wasInvoked && opts.dataFiles.wasInvoked) {
println("Please specify either a list of data directories or files but not both.")
@strubell
strubell / ChainNerDemo.scala
Created March 24, 2015 17:32
ChainNerDemo that works with unseen labels at test time
package cc.factorie.tutorial
import cc.factorie._
import java.io.File
import cc.factorie.variable._
import cc.factorie.model.{Parameters, DotTemplateWithStatistics2, DotTemplateWithStatistics1, TemplateModel}
import cc.factorie.infer.{BPSummary, BP, IteratedConditionalModes, GibbsSampler}
/** A demonstration of training a linear-chain CRF for named entity recognition.
Prints various diagnostics suitable to a demo.
def serialize(stream: java.io.OutputStream): Unit = {
import CubbieConversions._
val dstream = new java.io.DataOutputStream(new BufferedOutputStream(stream))
domains.foreach{domain => BinarySerializer.serialize(domain.dimensionDomain, dstream)}
BinarySerializer.serialize(model, dstream)
BinarySerializer.serialize(WordData.ambiguityClasses, dstream)
BinarySerializer.serialize(WordData.sureTokens, dstream)
BinarySerializer.serialize(WordData.docWordCounts, dstream)
// don't have implicits for serializing set, so convert to a map
val obsWordsMap = collection.mutable.HashMap[String,String]()
@strubell
strubell / rpp-stack-trace
Created May 30, 2015 17:29
RPP stack trace
org.jdom2.input.JDOMParseException: Error on line 1 of document file:/iesl/canvas/ksilvers/eval/data/acl/full-pdfs/E09-1037.pdf: Content is not allowed in prolog.
at org.jdom2.input.sax.SAXBuilderEngine.build(SAXBuilderEngine.java:228)
at org.jdom2.input.sax.SAXBuilderEngine.build(SAXBuilderEngine.java:277)
at org.jdom2.input.sax.SAXBuilderEngine.build(SAXBuilderEngine.java:264)
at org.jdom2.input.SAXBuilder.build(SAXBuilder.java:1116)
at edu.umass.cs.iesl.rpp.Main$.process(Main.scala:16)
at edu.umass.cs.iesl.rpp.BatchMain$$anonfun$main$1.apply(BatchMain.scala:47)
at edu.umass.cs.iesl.rpp.BatchMain$$anonfun$main$1.apply(BatchMain.scala:45)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
java.lang.IndexOutOfBoundsException: -1
at scala.collection.immutable.Vector.checkRangeConvert(Vector.scala:132)
at scala.collection.immutable.Vector.apply(Vector.scala:122)
at edu.umass.cs.iesl.xml_annotator.Annotator.pair2Total(Annotator.scala:561)
at edu.umass.cs.iesl.xml_annotator.Annotator$$anonfun$28.apply(Annotator.scala:964)
at edu.umass.cs.iesl.xml_annotator.Annotator$$anonfun$28.apply(Annotator.scala:962)
at scala.collection.immutable.HashMap$HashMap1.filter0(HashMap.scala:218)
at scala.collection.immutable.HashMap$HashTrieMap.filter0(HashMap.scala:385)
at scala.collection.immutable.HashMap$HashTrieMap.filter0(HashMap.scala:385)
at scala.collection.immutable.HashMap.filter(HashMap.scala:70)
@strubell
strubell / pdf2svg-parallel.sh
Created June 1, 2015 00:51
Convert a bunch of files from pdf -> svg using iesl-pdf-to-text
#!/bin/bash
#
# pdf2svg-parallel.sh
#
# Usage: pdf2svg-parallel.sh /dir/containing/pdfs /output/dir
#
# You must set the PDF2SVG_ROOT environment variable to a directory containing
# iesl-pdf-to-text, e.g. from that directory:
# export PDF2SVG_ROOT=`pwd`
#