This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This code is related to PR https://github.com/apache/spark/pull/17461
// It shows how to use the setInitialModel() param of LDA to build a model incrementally,
// and compares its performance (perplexity) with that of a model built in one shot.
import scala.collection.mutable | |
import org.apache.spark.ml.{Pipeline, PipelineModel} | |
import org.apache.spark.ml.clustering.{LDA, LDAModel} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tensorflow as tf | |
# Input for the pipeline: a single feature file stored on HDFS.
filenames = ["hdfs://10.152.104.73:8020/sogou/train_data/1_final.feature_transform"]
# Read the input as text; TextLineDataset produces one element per line.
dataset = tf.data.TextLineDataset(filenames)
# NOTE(review): Dataset.make_one_shot_iterator() is the TF 1.x graph-mode API;
# later releases expose it as tf.compat.v1.data.make_one_shot_iterator(dataset).
# Confirm the TensorFlow version this script targets before changing it.
iterator = dataset.make_one_shot_iterator()
# Despite the name, no .batch() is applied in the lines above, so each
# evaluation of this op presumably yields a single line rather than a batch.
next_batch = iterator.get_next()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tensorflow as tf | |
import numpy as np | |
# Toy corpus: tokens are separated by spaces, and a standalone '.' token
# sits between sentences.
corpus_raw = 'He is the king . The king is royal . She is the royal queen '
# Convert to lower case so that e.g. 'The' and 'the' become the same token.
corpus_raw = corpus_raw.lower()
# Token accumulator; presumably filled by the loop that follows.
words = []
for word in corpus_raw.split(): |