@yoyama
Created April 23, 2018 01:17
For comparing spark-corenlp against a hand-defined UDF: test1 uses the spark-corenlp wrapper functions (tokenize, lemma, pos), while test2 runs a StanfordCoreNLP pipeline directly inside a single UDF. A spark-shell usage sketch follows the first package, and a timed driver sketch follows the second.
package test1 {

  import com.databricks.spark.corenlp.functions._
  import org.apache.spark.sql.{DataFrame, SparkSession}

  // Variant using the spark-corenlp wrapper functions directly:
  // three separate functions, one output column each.
  class Test(implicit spark: SparkSession) {
    def run(df: DataFrame): DataFrame = {
      import spark.implicits._
      val df2 = df
        .select(tokenize($"Text").as("word"), lemma($"Text").as("lemma"), pos($"Text").as("pos"))
        .persist()
      df2.count()      // force evaluation so the NLP work actually runs
      df2.unpersist()  // returns df2, so it is also the method's result
    }
  }

  // Lemma only, for comparison against the three-column run above.
  class Test2(implicit spark: SparkSession) {
    def run(df: DataFrame): DataFrame = {
      import spark.implicits._
      val df2 = df.select(lemma($"Text").as("lemma")).persist()
      df2.count()
      df2.unpersist()
    }
  }
}
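
For reference, invoking the two test1 variants from spark-shell (which already provides spark) might look like the sketch below. This is illustrative, not part of the original gist, and assumes the input DataFrame has a Text column, matching the column name the code selects:

import org.apache.spark.sql.SparkSession
implicit val ss: SparkSession = spark  // the Test classes take an implicit SparkSession
import ss.implicits._
val df = Seq("Stanford is located in California.").toDF("Text")
(new test1.Test).run(df)   // tokenize + lemma + pos: three array columns
(new test1.Test2).run(df)  // lemma only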
package test2 {

  import java.util.Properties

  import edu.stanford.nlp.ling.CoreAnnotations.{SentencesAnnotation, TokensAnnotation}
  import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
  import org.apache.spark.sql.functions.udf
  import org.apache.spark.sql.{DataFrame, SparkSession}

  import scala.collection.JavaConverters._

  // One row of output per token: surface form, lemma, and POS tag.
  case class ann(word: String, lemma: String, pos: String)

  object NLProc {
    val props = new Properties()
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma")

    // Built once per JVM (lazily, on first use); @transient keeps the
    // non-serializable pipeline out of any serialized closure.
    @transient lazy val pipeline = new StanfordCoreNLP(props)

    // Run the whole pipeline once per input string and emit one
    // ann per token, flattened across all sentences.
    val nlproc_f: String => Seq[ann] = { text =>
      val document = new Annotation(text)
      pipeline.annotate(document)
      document.get(classOf[SentencesAnnotation]).asScala.flatMap { s =>
        s.get(classOf[TokensAnnotation]).asScala.map { t =>
          ann(t.word(), t.lemma(), t.tag())
        }
      }
    }

    val nlproc = udf(nlproc_f)
  }

  // Variant using the hand-rolled UDF: one call yields word, lemma,
  // and POS together in a single array-of-struct column.
  class Test(implicit spark: SparkSession) {
    def run(df: DataFrame): DataFrame = {
      import spark.implicits._
      val df2 = df.select(NLProc.nlproc($"Text").as("nlp")).persist()
      df2.count()
      df2.unpersist()
    }
  }
}
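
To compare the two approaches end to end, a small standalone driver like the one below could time each run. This is a sketch, not part of the gist: the Bench object, its time helper, the app name, and the sample sentences are all hypothetical:

import org.apache.spark.sql.SparkSession

object Bench {
  // Crude wall-clock timing around a forced evaluation (each Test.run
  // already calls count(), so the work happens inside the timed block).
  def time[A](label: String)(body: => A): A = {
    val t0 = System.nanoTime()
    val result = body
    println(f"$label%s: ${(System.nanoTime() - t0) / 1e9}%.2f s")
    result
  }

  def main(args: Array[String]): Unit = {
    implicit val spark: SparkSession =
      SparkSession.builder().appName("corenlp-vs-udf").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq(
      "Stanford is located in California.",
      "Spark makes distributed text processing straightforward."
    ).toDF("Text")

    time("spark-corenlp (tokenize+lemma+pos)") { (new test1.Test).run(df) }
    time("hand-rolled UDF (single pipeline)")  { (new test2.Test).run(df) }

    spark.stop()
  }
}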