Skip to content

Instantly share code, notes, and snippets.

@bobbruno
Created January 12, 2018 10:25
Show Gist options
  • Save bobbruno/69dfa9e47cd755f4613722586e96346c to your computer and use it in GitHub Desktop.
Save bobbruno/69dfa9e47cd755f4613722586e96346c to your computer and use it in GitHub Desktop.
Stanford CoreNLP in Spark Scala
/**
This class allows usage of CoreNLP in Spark, creating an instance of the pipeline on each worker so that the
code can run in parallel.
@param annotators: the CoreNLP annotator pipeline
@param params: the parameters desired for the annotators
*/
class NLPPipeline(annotators: String, params: Tuple2[String, String]*) extends Serializable {
import edu.stanford.nlp.pipeline._
import java.util.Properties
@transient private var nlpPipeline: StanfordCoreNLP = _
/**
Returns a CoreNLP pipeline local to the worker, using the constructor parameters
*/
private def getOrCreatePipeline(): StanfordCoreNLP = {
if (nlpPipeline == null) {
val props = new Properties()
props.setProperty("annotators", annotators)
if (params.nonEmpty) params.map{p => props.setProperty(p._1, p._2)}
nlpPipeline = new StanfordCoreNLP(props)
}
nlpPipeline
}
/**
Basic step of the pipeline, transforming any text into a CoreNLP document.
@param keyword: the text to be transformed
*/
def transform(keyword: String) = {
val pipeline = getOrCreatePipeline()
pipeline.process(keyword)
}
}
/**
Example object implementing the lemmatization pipeline
*/
object Lemma extends NLPPipeline("tokenize, ssplit, pos, lemma") {
import edu.stanford.nlp.ling.CoreAnnotations._
import scala.collection.JavaConversions._
/**
Helper class to give nice structure to the results in a DataFrame
*/
case class Lemmas(tokens: Seq[String], lemmas: Seq[String])
/**
udf to run the pipeline on a dataframe column.
*/
def lemmatize = udf((keyword: String) => {
val doc = transform(keyword)
val tokens = doc.get(classOf[SentencesAnnotation]).flatMap(_.get(classOf[TokensAnnotation]))
Lemmas(tokens.map(_.get(classOf[TextAnnotation])), tokens.map(_.get(classOf[LemmaAnnotation])))
})
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment