Herman Wandabwa wandabwa2004

## betweennesscentrality.csv

          
            Top20VerticesRankedbyBetweenness-Centrality
            Betweenness-Centrality
            Betweenness-Centrality(%)

            
              oleitumbi
              293985.8683
              31.158438

            
              bcollapsed
              52009.4251
              5.512280

            
              informer_ke
              50096.89302
              5.309578

            
              mbuimumbi
              48011.00835
              5.088503

            
              kasibajohnrich
              47221.56874
              5.004833

            
              wens87366885
              45944.92043
              4.869526

            
              mattoduor
              41294.73213
              4.376671

            
              udakenya
              41147.55715
              4.361072

            
              mrodili3
              37091.14136
              3.931148

## cleanuptweets.py
# Tweet clean-up.
#
# Order matters: links, @mentions and '#' markers are recognised by their
# punctuation, so they have to be stripped BEFORE every non-letter character
# is blanked out. Blanking first would erase the '@', '#' and '://' that the
# other patterns look for, turning those steps into no-ops.

# Remove links: each run of non-whitespace that starts with "http". Using \S*
# instead of .* keeps the words that follow a link in the tweet.
df_final_copy['tweet'] = df_final_copy['tweet'].map(lambda x: re.sub(r'http\S*', '', str(x)))
# Remove @mentions (the '@' together with the handle that follows it).
df_final_copy['tweet'] = df_final_copy['tweet'].map(lambda x: re.sub(r'@\w*', '', str(x)))
# Remove the hashtag marker but keep the tag word itself.
df_final_copy['tweet'] = df_final_copy['tweet'].map(lambda x: re.sub(r'#', '', str(x)))
# Finally replace every remaining non-letter character with a space.
df_final_copy['tweet'] = df_final_copy['tweet'].map(lambda x: re.sub(r'[^a-zA-Z]', ' ', str(x)))

## groups.py
# Per-group column means of `data`:
#   dayofweek_grouped_data - one row per distinct 'Day_of_week' value
#   dayTypeGroupedData     - one row per distinct 'Is Weekday' value
dayofweek_grouped_data = data.groupby(by=['Day_of_week']).mean()
dayTypeGroupedData = data.groupby(by=['Is Weekday']).mean()

## date_related_manipulations.py
# Parse the day/month/year 'Date' strings, then derive weekday helper columns.
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
# Numeric day-of-week code (0 is Monday ... 6 is Sunday, matching `days` below).
data['Day_of_week'] = data['Date'].dt.dayofweek
# Boolean flags: codes 0-4 are weekdays, codes 5-6 are the weekend.
data['Is Weekday'] = data['Day_of_week'] < 5
data['Is Weekend'] = data['Day_of_week'] > 4
# Label lookups: `days` maps the 0-6 code to its short name; `DayCodes` holds
# the same names in a list with an empty string in position 0.
days = dict(enumerate(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']))
DayCodes = [''] + list(days.values())

## plot_lda.py
import pyLDAvis
import pyLDAvis.gensim

# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()

# Prepare the quarter-1 visualization from the trained model, the corpus it
# was trained on and the matching dictionary.
visualization_q1 = pyLDAvis.gensim.prepare(
    lda_model_bigrams_q1,
    bi_corpus_q1,
    dictionary_bi_q1,
    mds='mmds',
)
# Bare expression: displays the plot when it is the last statement of a
# notebook cell.
visualization_q1

# Also write the visualization to an HTML file.
pyLDAvis.save_html(visualization_q1, 'lda_q1.html')

## sample_topics.py
# Print the topics of the quarter-2 model, one "Topic / Words" pair each.
# NOTE(review): num_topics=-1 is understood to request every topic - confirm
# against the gensim documentation.
for idx, topic in lda_model_bigrams_q2.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

## lda_training.py
# Train one bigram LDA topic model per quarterly subset. All four models use
# the same settings; only the corpus and its dictionary differ.
_lda_settings = dict(num_topics=10, passes=2, workers=10)

lda_model_bigrams_q1 = gensim.models.LdaMulticore(bi_corpus_q1, id2word=dictionary_bi_q1, **_lda_settings)
lda_model_bigrams_q2 = gensim.models.LdaMulticore(bi_corpus_q2, id2word=dictionary_bi_q2, **_lda_settings)
lda_model_bigrams_q3 = gensim.models.LdaMulticore(bi_corpus_q3, id2word=dictionary_bi_q3, **_lda_settings)
lda_model_bigrams_q4 = gensim.models.LdaMulticore(bi_corpus_q4, id2word=dictionary_bi_q4, **_lda_settings)

## bigram_generation.py
# Build one gensim Dictionary per quarter from that quarter's
# bigram-transformed token lists.
dictionary_bi_q1 = gensim.corpora.Dictionary(documents=bigram_mod_q1[data_words_q1])
dictionary_bi_q2 = gensim.corpora.Dictionary(documents=bigram_mod_q2[data_words_q2])
dictionary_bi_q3 = gensim.corpora.Dictionary(documents=bigram_mod_q3[data_words_q3])
dictionary_bi_q4 = gensim.corpora.Dictionary(documents=bigram_mod_q4[data_words_q4])

# Quarter-1 corpus: every bigram-transformed document converted to its
# bag-of-words form through the quarter-1 dictionary.
bi_corpus_q1 = list(map(dictionary_bi_q1.doc2bow, bigram_mod_q1[data_words_q1]))

## n_grams.py

# Bigram phrase models, one per quarter, all built with the same settings.
# A higher threshold yields fewer phrases.
_phrase_settings = dict(min_count=3, threshold=100)

bigram_q1 = gensim.models.Phrases(data_words_q1, **_phrase_settings)
bigram_q2 = gensim.models.Phrases(data_words_q2, **_phrase_settings)
bigram_q3 = gensim.models.Phrases(data_words_q3, **_phrase_settings)
bigram_q4 = gensim.models.Phrases(data_words_q4, **_phrase_settings)

# Wrap the quarter-1 model in a Phraser: a faster way to get a sentence
# clubbed as bigrams.
bigram_mod_q1 = gensim.models.phrases.Phraser(bigram_q1)

## lists.py
# Materialise the 'tokens' entry of each quarterly subset as a plain list.
data_words_q1, data_words_q2, data_words_q3, data_words_q4 = (
    list(quarter['tokens']) for quarter in (q1, q2, q3, q4)
)
Top20VerticesRankedbyBetweenness-Centrality	Betweenness-Centrality	Betweenness-Centrality(%)
oleitumbi	293985.8683	31.158438
bcollapsed	52009.4251	5.512280
informer_ke	50096.89302	5.309578
mbuimumbi	48011.00835	5.088503
kasibajohnrich	47221.56874	5.004833
wens87366885	45944.92043	4.869526
mattoduor	41294.73213	4.376671
udakenya	41147.55715	4.361072
mrodili3	37091.14136	3.931148
	# Tweet clean-up.
	#
	# Order matters: links, @mentions and '#' markers are recognised by their
	# punctuation, so they have to be stripped BEFORE every non-letter character
	# is blanked out. Blanking first would erase the '@', '#' and '://' that the
	# other patterns look for, turning those steps into no-ops.

	# Remove links: each run of non-whitespace that starts with "http". Using \S*
	# instead of .* keeps the words that follow a link in the tweet.
	df_final_copy['tweet'] = df_final_copy['tweet'].map(lambda x: re.sub(r'http\S*', '', str(x)))
	# Remove @mentions (the '@' together with the handle that follows it).
	df_final_copy['tweet'] = df_final_copy['tweet'].map(lambda x: re.sub(r'@\w*', '', str(x)))
	# Remove the hashtag marker but keep the tag word itself.
	df_final_copy['tweet'] = df_final_copy['tweet'].map(lambda x: re.sub(r'#', '', str(x)))
	# Finally replace every remaining non-letter character with a space.
	df_final_copy['tweet'] = df_final_copy['tweet'].map(lambda x: re.sub(r'[^a-zA-Z]', ' ', str(x)))
	# Per-group column means of `data`:
	#   dayofweek_grouped_data - one row per distinct 'Day_of_week' value
	#   dayTypeGroupedData     - one row per distinct 'Is Weekday' value
	dayofweek_grouped_data = data.groupby(by=['Day_of_week']).mean()
	dayTypeGroupedData = data.groupby(by=['Is Weekday']).mean()
	# Parse the day/month/year 'Date' strings, then derive weekday helper columns.
	data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
	# Numeric day-of-week code (0 is Monday ... 6 is Sunday, matching `days` below).
	data['Day_of_week'] = data['Date'].dt.dayofweek
	# Boolean flags: codes 0-4 are weekdays, codes 5-6 are the weekend.
	data['Is Weekday'] = data['Day_of_week'] < 5
	data['Is Weekend'] = data['Day_of_week'] > 4
	# Label lookups: `days` maps the 0-6 code to its short name; `DayCodes` holds
	# the same names in a list with an empty string in position 0.
	days = dict(enumerate(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']))
	DayCodes = [''] + list(days.values())
	import pyLDAvis
	import pyLDAvis.gensim

	# Switch pyLDAvis to notebook display mode.
	pyLDAvis.enable_notebook()

	# Prepare the quarter-1 visualization from the trained model, the corpus it
	# was trained on and the matching dictionary.
	visualization_q1 = pyLDAvis.gensim.prepare(
	    lda_model_bigrams_q1,
	    bi_corpus_q1,
	    dictionary_bi_q1,
	    mds='mmds',
	)
	# Bare expression: displays the plot when it is the last statement of a
	# notebook cell.
	visualization_q1

	# Also write the visualization to an HTML file.
	pyLDAvis.save_html(visualization_q1, 'lda_q1.html')
	# Print the topics of the quarter-2 model, one "Topic / Words" pair each.
	# The print call must be indented under the `for`; without a body the loop
	# is a syntax error.
	for idx, topic in lda_model_bigrams_q2.print_topics(-1):
	    print('Topic: {} \nWords: {}'.format(idx, topic))
	# Train one bigram LDA topic model per quarterly subset. All four models use
	# the same settings; only the corpus and its dictionary differ.
	_lda_settings = dict(num_topics=10, passes=2, workers=10)

	lda_model_bigrams_q1 = gensim.models.LdaMulticore(bi_corpus_q1, id2word=dictionary_bi_q1, **_lda_settings)
	lda_model_bigrams_q2 = gensim.models.LdaMulticore(bi_corpus_q2, id2word=dictionary_bi_q2, **_lda_settings)
	lda_model_bigrams_q3 = gensim.models.LdaMulticore(bi_corpus_q3, id2word=dictionary_bi_q3, **_lda_settings)
	lda_model_bigrams_q4 = gensim.models.LdaMulticore(bi_corpus_q4, id2word=dictionary_bi_q4, **_lda_settings)
	# Build one gensim Dictionary per quarter from that quarter's
	# bigram-transformed token lists.
	dictionary_bi_q1 = gensim.corpora.Dictionary(documents=bigram_mod_q1[data_words_q1])
	dictionary_bi_q2 = gensim.corpora.Dictionary(documents=bigram_mod_q2[data_words_q2])
	dictionary_bi_q3 = gensim.corpora.Dictionary(documents=bigram_mod_q3[data_words_q3])
	dictionary_bi_q4 = gensim.corpora.Dictionary(documents=bigram_mod_q4[data_words_q4])

	# Quarter-1 corpus: every bigram-transformed document converted to its
	# bag-of-words form through the quarter-1 dictionary.
	bi_corpus_q1 = list(map(dictionary_bi_q1.doc2bow, bigram_mod_q1[data_words_q1]))

	# Bigram phrase models, one per quarter, all built with the same settings.
	# A higher threshold yields fewer phrases.
	_phrase_settings = dict(min_count=3, threshold=100)

	bigram_q1 = gensim.models.Phrases(data_words_q1, **_phrase_settings)
	bigram_q2 = gensim.models.Phrases(data_words_q2, **_phrase_settings)
	bigram_q3 = gensim.models.Phrases(data_words_q3, **_phrase_settings)
	bigram_q4 = gensim.models.Phrases(data_words_q4, **_phrase_settings)

	# Wrap the quarter-1 model in a Phraser: a faster way to get a sentence
	# clubbed as bigrams.
	bigram_mod_q1 = gensim.models.phrases.Phraser(bigram_q1)
	# Materialise the 'tokens' entry of each quarterly subset as a plain list.
	data_words_q1, data_words_q2, data_words_q3, data_words_q4 = (
	    list(quarter['tokens']) for quarter in (q1, q2, q3, q4)
	)