Skip to content

Instantly share code, notes, and snippets.

View wandabwa2004's full-sized avatar

Herman Wandabwa wandabwa2004

View GitHub Profile
Top20VerticesRankedbyBetweenness-Centrality Betweenness-Centrality Betweenness-Centrality(%)
oleitumbi 293985.8683 31.158438
bcollapsed 52009.4251 5.512280
informer_ke 50096.89302 5.309578
mbuimumbi 48011.00835 5.088503
kasibajohnrich 47221.56874 5.004833
wens87366885 45944.92043 4.869526
mattoduor 41294.73213 4.376671
udakenya 41147.55715 4.361072
mrodili3 37091.14136 3.931148
#A little clean up
# Order matters: links, @mentions and '#' have to be stripped BEFORE the
# letters-only pass. Once every non-letter has been turned into a space the
# '@' and '#' characters are gone, so those patterns could never match and
# mention usernames would stay in the text.
def _clean_tweet(raw):
    """Return the tweet text without links, @mentions or '#', keeping letters only.

    The value is passed through ``str`` first, so non-string cells are
    cleaned as their string representation.
    """
    text = str(raw)
    # remove links: only the token starting with http, not the rest of the tweet
    text = re.sub(r'http\S*', '', text)
    # remove @mentions
    text = re.sub(r'@\w*', '', text)
    # remove the hashtag marker but keep the tag word itself
    text = re.sub(r'#', '', text)
    # every remaining non-letter character becomes a space
    return re.sub(r'[^a-zA-Z]', ' ', text)

df_final_copy['tweet'] = df_final_copy['tweet'].map(_clean_tweet)
# Per-group column means of `data`, once keyed by the 'Is Weekday' column
# and once keyed by the 'Day_of_week' column.
dayTypeGroupedData, dayofweek_grouped_data = (
    data.groupby([group_column]).mean()
    for group_column in ('Is Weekday', 'Day_of_week')
)
# Interactive pyLDAvis view of the quarter-1 bigram LDA model.
import pyLDAvis
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
import pyLDAvis.gensim
# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()
# Build the visualization data from the model, its corpus and its dictionary;
# mds='mmds' selects the scaling method used to lay out the topics.
visualization_q1 = pyLDAvis.gensim.prepare(lda_model_bigrams_q1, bi_corpus_q1, dictionary_bi_q1, mds='mmds') #prepares the model, related corpus and dictionary.
# Bare expression: presumably meant to be the last line of its own notebook
# cell so the notebook displays it - TODO confirm the cell layout.
visualization_q1 #Visualizes the plot
# Save the visualization as a standalone HTML file named 'lda_q1.html'.
pyLDAvis.save_html(visualization_q1, 'lda_q1.html')
#sample topics quarter 2
# print_topics(-1) is unpacked as (topic id, topic description) pairs;
# -1 requests every topic of the model rather than a subset.
for idx, topic in lda_model_bigrams_q2.print_topics(-1):
    # The loop body must be indented; at column 0 the file does not parse.
    print('Topic: {} \nWords: {}'.format(idx, topic))
# Train one bigram LDA topic model per subset (q1..q4).
# Hyperparameters shared by all four models are kept in one place.
_lda_settings = {'num_topics': 10, 'passes': 2, 'workers': 10}
lda_model_bigrams_q1 = gensim.models.LdaMulticore(
    bi_corpus_q1, id2word=dictionary_bi_q1, **_lda_settings)
lda_model_bigrams_q2 = gensim.models.LdaMulticore(
    bi_corpus_q2, id2word=dictionary_bi_q2, **_lda_settings)
lda_model_bigrams_q3 = gensim.models.LdaMulticore(
    bi_corpus_q3, id2word=dictionary_bi_q3, **_lda_settings)
lda_model_bigrams_q4 = gensim.models.LdaMulticore(
    bi_corpus_q4, id2word=dictionary_bi_q4, **_lda_settings)
# Build a gensim Dictionary per subset from its bigram-transformed documents
# (each subset's Phraser applied to that subset's token lists).
dictionary_bi_q1 = gensim.corpora.Dictionary(
    documents=bigram_mod_q1[data_words_q1])
dictionary_bi_q2 = gensim.corpora.Dictionary(
    documents=bigram_mod_q2[data_words_q2])
dictionary_bi_q3 = gensim.corpora.Dictionary(
    documents=bigram_mod_q3[data_words_q3])
dictionary_bi_q4 = gensim.corpora.Dictionary(
    documents=bigram_mod_q4[data_words_q4])
# Bigram corpus for the q1 subset: every bigram-transformed document is run
# through the q1 dictionary's doc2bow.
bi_corpus_q1 = list(map(dictionary_bi_q1.doc2bow, bigram_mod_q1[data_words_q1]))
## Bigram (phrase) models, one per quarter.
# min_count=3 and threshold=100 are used for every quarter;
# a higher threshold yields fewer phrases.
bigram_q1 = gensim.models.Phrases(
    sentences=data_words_q1, min_count=3, threshold=100)
bigram_q2 = gensim.models.Phrases(
    sentences=data_words_q2, min_count=3, threshold=100)
bigram_q3 = gensim.models.Phrases(
    sentences=data_words_q3, min_count=3, threshold=100)
bigram_q4 = gensim.models.Phrases(
    sentences=data_words_q4, min_count=3, threshold=100)
# Wrap the q1 model in a Phraser: a faster way to get a sentence clubbed as a bigram.
bigram_mod_q1 = gensim.models.phrases.Phraser(bigram_q1)
# Materialise the 'tokens' entry of each subset (q1..q4) as a plain list.
data_words_q1, data_words_q2, data_words_q3, data_words_q4 = (
    list(subset['tokens']) for subset in (q1, q2, q3, q4)
)