def split(): Unit = {
  import org.ansj.library.DicLibrary
  import org.ansj.splitWord.analysis.DicAnalysis
  import org.apache.spark.SparkConf
  import org.apache.spark.sql.{SaveMode, SparkSession}
  val conf = new SparkConf().setAppName("HelloWorld")
  val spark = SparkSession
    .builder()
    .config(conf)
    .enableHiveSupport()
    .getOrCreate()
  // needed for the .toDF() calls on RDDs below
  import spark.implicits._
  // HDFS path where the custom dictionary is stored
  val dicPath = "/tmp/user.dic"
  // key name for the custom dictionary (any string works)
  val dicKey = "HelloDic"
  // test data for segmentation
  val str = "杭州排名第一的网红店"
  val text = spark.sparkContext.parallelize(Seq(str))
  // create the dictionary: write a single-column CSV containing the custom word "网红店"
  // spark.sparkContext.parallelize(Seq("")).toDF().write.format("csv").mode(SaveMode.Overwrite).save(dicPath)
  spark.sparkContext.parallelize(Seq("网红店")).toDF().write.format("csv").mode(SaveMode.Overwrite).save(dicPath)
  // load the dictionary (use spark.sparkContext; `sc` is not defined in this scope)
  val dic = spark.sparkContext.textFile(dicPath).cache()
  // initialize the ansj user dictionary on the driver, one word per line
  dic.collect().foreach(line =>
    DicLibrary.insertOrCreate(dicKey, line.split(",")(0), DicLibrary.DEFAULT_NATURE, DicLibrary.DEFAULT_FREQ)
  )
  // broadcast the user dictionary to the executors
  val userDic = spark.sparkContext.broadcast(DicLibrary.get(dicKey))
  // segment each line with DicAnalysis using the broadcast user dictionary
  text.mapPartitions(iter => {
    iter.map(line =>
      DicAnalysis.parse(line, userDic.value).toStringWithOutNature(" ")
    )
  }).take(1).foreach(println)
}
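
// A minimal sketch of how split() might be wired into a runnable spark-submit job.
// Assumptions (not part of the original gist): the object name SegmentationJob is
// illustrative, split() is defined in (or imported into) this object, and the
// ansj_seg and Spark dependencies are on the classpath (verify the exact ansj_seg
// coordinates and version on Maven Central).
object SegmentationJob {
  def main(args: Array[String]): Unit = {
    // run the custom-dictionary segmentation demo defined above
    split()
  }
}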