Skip to content

Instantly share code, notes, and snippets.

@ululh
Last active May 22, 2021 01:27
Show Gist options
  • Save ululh/d00ab305e937d8eba3ba963bd558d674 to your computer and use it in GitHub Desktop.
Save ululh/d00ab305e937d8eba3ba963bd558d674 to your computer and use it in GitHub Desktop.
LDA (Latent Dirichlet Allocation) fitting with python scikit-learn
# derived from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
# An explanation is available here: https://www.linkedin.com/pulse/dissociating-training-predicting-latent-dirichlet-lucien-tardres
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle
# Train a small LDA topic model on a toy corpus and pickle everything needed
# to run predictions later without re-fitting.

# Maximum vocabulary size kept by the vectorizer.
n_features = 50
# Number of latent topics to fit.
n_topics = 2

# Training dataset
data_samples = [
    "I like to eat broccoli and bananas.",
    "I ate a banana and spinach smoothie for breakfast.",
    "Chinchillas and kittens are cute.",
    "My sister adopted a kitten yesterday.",
    "Look at this cute hamster munching on a piece of broccoli.",
]

# Extract features and vectorize the dataset (raw term counts).
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=1,
    max_features=n_features,
    stop_words='english',
)
tf = tf_vectorizer.fit_transform(data_samples)

# Save the vocabulary as a plain list of strings.
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() (1.0+) returns an ndarray, so convert it to keep
# the pickled payload a list as before. Fall back for older installs.
try:
    dic = tf_vectorizer.get_feature_names_out().tolist()
except AttributeError:
    dic = tf_vectorizer.get_feature_names()

# `n_topics` was renamed to `n_components` in scikit-learn 0.19 and removed in
# 0.21; passing the old keyword raises TypeError on current releases.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,  # fixed seed so the fitted topics are reproducible
)

# Train LDA. The return value is kept under its original name `p1`.
p1 = lda.fit(tf)

# Save all data necessary for later prediction: vocabulary, topic-word
# matrix, its exponentiated Dirichlet expectation, and the document-topic
# prior.
model = (
    dic,
    lda.components_,
    lda.exp_dirichlet_component_,
    lda.doc_topic_prior_,
)
# NOTE: pickle files must only be loaded from trusted sources.
with open('outfile', 'wb') as fp:
    pickle.dump(model, fp)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment