For comparing spark-corenlp with a hand-rolled UDF: test1 calls spark-corenlp's tokenize/lemma/pos column functions, while test2 annotates each row once with a StanfordCoreNLP pipeline inside a single UDF.
package test1 {

  import com.databricks.spark.corenlp.functions._
  import org.apache.spark.sql.{DataFrame, SparkSession}

  // Variant A: three separate spark-corenlp function calls over the same column.
  class Test(implicit spark: SparkSession) {
    def run(df: DataFrame): DataFrame = {
      import spark.implicits._
      val df2 = df.select(tokenize($"Text").as("word"), lemma($"Text").as("lemma"), pos($"Text").as("pos")).persist
      df2.count       // force evaluation so the NLP work actually happens here
      df2.unpersist() // returns df2 itself, so this is also the method's result
    }
  }

  // Variant B: lemma only, to measure the cost of a single spark-corenlp function.
  class Test2(implicit spark: SparkSession) {
    def run(df: DataFrame): DataFrame = {
      import spark.implicits._
      val df2 = df.select(lemma($"Text").as("lemma")).persist
      df2.count
      df2.unpersist()
    }
  }
}
package test2 {

  import java.util.Properties

  import edu.stanford.nlp.ling.CoreAnnotations.{SentencesAnnotation, TokensAnnotation}
  import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
  import org.apache.spark.sql.functions.udf
  import org.apache.spark.sql.{DataFrame, SparkSession}

  import scala.collection.JavaConverters._

  // One token of UDF output: surface form, lemma, and POS tag.
  case class ann(word: String, lemma: String, pos: String)

  object NLProc {
    val props = new Properties()
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma")

    // One pipeline per JVM: Scala objects are initialized once per executor.
    @transient val pipeline = new StanfordCoreNLP(props)

    // Annotate the text once and emit (word, lemma, pos) for every token.
    val nlproc_f = (text: String) => {
      val document = new Annotation(text)
      pipeline.annotate(document)
      document.get(classOf[SentencesAnnotation]).asScala.flatMap { s =>
        s.get(classOf[TokensAnnotation]).asScala.map { t =>
          ann(t.word(), t.lemma(), t.tag())
        }
      }
    }: Seq[ann]

    val nlproc = udf(nlproc_f)
  }

  // Variant C: a single hand-rolled UDF that produces all three fields in one pass.
  class Test(implicit spark: SparkSession) {
    def run(df: DataFrame): DataFrame = {
      import spark.implicits._
      val df2 = df.select(NLProc.nlproc($"Text").as("nlp")).persist()
      df2.count()
      df2.unpersist()
    }
  }
}
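
A minimal driver sketch for running the comparison, not part of the original gist: it assumes spark-corenlp and the Stanford CoreNLP English models are on the classpath, and everything here except the Text column name (which the tests above require) is illustrative, including the runner package, the CompareRunner object, the local master setting, the timing helper, and the sample rows.

package runner {

  import org.apache.spark.sql.SparkSession

  object CompareRunner {
    def main(args: Array[String]): Unit = {
      implicit val spark: SparkSession = SparkSession.builder()
        .appName("corenlp-udf-comparison")
        .master("local[*]") // assumption: local run for a quick check
        .getOrCreate()
      import spark.implicits._

      // Any DataFrame with a string column named "Text" works; these rows are placeholders.
      val df = Seq(
        "Stanford University is located in California.",
        "It is a great university."
      ).toDF("Text")

      // Crude wall-clock timer, sufficient for a side-by-side comparison.
      def time[A](label: String)(body: => A): A = {
        val t0 = System.nanoTime()
        val result = body
        println(f"$label%s: ${(System.nanoTime() - t0) / 1e9}%.2f s")
        result
      }

      time("test1.Test  (tokenize+lemma+pos)") { (new test1.Test).run(df) }
      time("test1.Test2 (lemma only)")         { (new test1.Test2).run(df) }
      time("test2.Test  (single UDF)")         { (new test2.Test).run(df) }

      spark.stop()
    }
  }
}

Note that the UDF variant returns its result as a single array-of-struct column named nlp; if per-token rows are needed it can be flattened with org.apache.spark.sql.functions.explode, whereas the spark-corenlp variant already yields one column per function call.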