Skip to content

Instantly share code, notes, and snippets.

@allatambov
Last active June 28, 2018 12:32
Show Gist options
  • Save allatambov/b164697ca61624d5c9d0f5900bb554b9 to your computer and use it in GitHub Desktop.
Save allatambov/b164697ca61624d5c9d0f5900bb554b9 to your computer and use it in GitHub Desktop.
import pandas as pd
# Read the article corpus from disk, decoding the CSV as UTF-8.
df = pd.read_csv(
    'articles.csv',
    encoding='UTF-8',
)
# normalization
import string
# Translation table that deletes ASCII punctuation plus the guillemets and
# the em dash.  Built once at import time rather than on every call, since
# normalize() is applied to each row of a DataFrame column.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + '«»—')


def normalize(x):
    """Return *x* lower-cased with punctuation characters removed.

    Args:
        x: The text to normalize (a ``str``).

    Returns:
        A new string in which every character from ``string.punctuation``
        and each of ``«``, ``»``, ``—`` has been deleted and the remaining
        text converted to lower case.  Whitespace is left untouched, so
        removing a punctuation mark that stood between two spaces leaves
        both spaces in place.
    """
    return x.translate(_PUNCT_TABLE).lower()
# filtering
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
# Stop-word sets keyed by language name.  Loading the NLTK word list and
# turning it into a set is the same work for every call with the same
# language, so it is done once and reused (filter_words is applied per row).
_STOPWORD_CACHE = {}


def _stopword_set(lang):
    """Return the NLTK stop-word set for *lang*, loading it at most once."""
    if lang not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[lang] = frozenset(stopwords.words(lang))
    return _STOPWORD_CACHE[lang]


def filter_words(text, lang='russian'):
    """Tokenize *text* and drop the tokens that are stop words.

    Args:
        text: The string to tokenize with ``nltk.tokenize.word_tokenize``.
        lang: Name of the NLTK stop-word list to filter against.

    Returns:
        A list of the tokens of *text*, in their original order, excluding
        every token that appears in the stop-word list for *lang*.  The
        comparison is an exact string match, so case is not folded here.
    """
    stop_words = _stopword_set(lang)
    return [token for token in word_tokenize(text) if token not in stop_words]
# Add two derived columns: the text with punctuation stripped and lower-cased,
# then that text split into tokens with stop words removed.
df['text_norm'] = df['text'].apply(normalize)
df['words'] = df['text_norm'].apply(filter_words)
# Bare expression kept as in the original (presumably for notebook display).
df
# gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
# One token list per row of the frame, in row order.
sentences = list(df.words)
# Integer tags 0..n-1: each document is tagged with its row position.
labels = range(len(sentences))
docs = [TaggedDocument(words, [tag]) for tag, words in zip(labels, sentences)]
docs
model = Doc2Vec()
model
model.build_vocab(docs)
# train() needs the corpus size and the number of passes stated explicitly.
# A single call with epochs=10 replaces the former hand-written loop that
# called train() ten times while lowering alpha by hand; gensim then decays
# the learning rate from model.alpha to model.min_alpha on its own.
model.train(docs, total_examples=model.corpus_count, epochs=10)
# Round-trip through disk so the saved file is known to load.
model.save('d2v_model')
model = Doc2Vec.load('d2v_model')
model.corpus_count
# Documents most similar to the document tagged 0.
# NOTE(review): gensim 4 renamed `docvecs` to `dv` -- confirm the installed
# version before upgrading.
model.docvecs.most_similar([0])
# your turn
# file: your_turn.csv
# Load the exercise data set.
d = pd.read_csv('your_turn.csv')
d
from gensim.models.ldamodel import LdaModel
from gensim import corpora
# Same preprocessing as for the articles, applied to the `post` column:
# strip punctuation and lower-case, then tokenize and drop stop words.
d['text_norm'] = d['post'].apply(normalize)
d['words'] = d['text_norm'].apply(filter_words)
texts = list(d['words'])
# Map each distinct token to an integer id, then turn every document into
# its bag-of-words representation.
dict_ = corpora.Dictionary(texts)
corpus = list(map(dict_.doc2bow, texts))
# Fit a 5-topic LDA model and show the 30 highest-weighted words per topic.
lda = LdaModel(corpus, num_topics=5, id2word=dict_)
lda.print_topics(5, num_words=30)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment