Olalekan Fagbuyi leks39

## HOTD 7
#importing word cloud image from directory
# The picture is read from a fixed local path and converted to a NumPy array.
mask_path = "C:\\Users\\ofagb\\Downloads\\GOT.jpg"
mask = np.array(Image.open(mask_path))

# Preview the loaded array on an 18x12 figure with the axes hidden.
plt.figure(figsize=(18, 12))
plt.imshow(mask, interpolation='bilinear')
plt.axis('off')
plt.show()

#create word cloud
# Every setting is passed straight to the WordCloud constructor; `mask` is the
# image array built earlier in this script.
masked_wordcloud = WordCloud(
    background_color='white',
    contour_color='black',
    contour_width=3,
    min_font_size=8,
    max_words=300,
    mask=mask,
)

## HOTD 6
#Calculating text polarity using TextBlob
def polarity(Processed_Text):
    """Return the TextBlob sentiment polarity of ``Processed_Text``.

    The text is wrapped in a ``TextBlob`` and the ``polarity`` field of its
    ``sentiment`` attribute is returned unchanged.
    """
    blob = TextBlob(Processed_Text)
    return blob.sentiment.polarity

#Add polarity score to df
# Score every entry of the 'Processed_Text' column and store the result in a
# new 'Polarity' column.
processed_text_col = hotd_df2['Processed_Text']
hotd_df2['Polarity'] = processed_text_col.apply(polarity)
hotd_df2.head()

#Creating polarity sentiment column
def p_sentiment(label):

## HOTD5
#first step in text processing is taking stopwords out of the text using the NLTK stopwords corpus
#stop words are common words in english that add little or no value in NLP
# A fresh list is built from the words returned for 'english'.
stop_words = [*stopwords.words('english')]

#user defined common words that do not add value in the context of the show will also be removed
# NOTE: 'GOT' and 'got' are both listed as separate entries.
hotd_stop_words = [
    'house', 'GOT', 'thrones', 'lady', 'lord', 'ser',
    'gameofthrones', 'hotd', 'premiere', 'watching', 'got', 'episode',
    'serie', 'houseofthedragon', 'fan', 'hbo', 'hbomax', 'finale',
    'end', 'getting', 'today', 'week', 'watch', 'stream',
    'houseofdragon', 'houseofthedragonhbo', 'houseofthedragonep',
]

## HOTD4
#selecting main characters from HOTD show
# Lower-case first names, five per row.
hotd_chars = [
    'rhaenyra', 'daemon', 'alicent', 'otto', 'viserys',
    'rhaenys', 'vaemond', 'lucerys', 'jacaerys', 'lyonel',
    'corlys', 'aegon', 'aemond', 'larys', 'harwin',
    'laenor', 'laena', 'criston', 'mysaria', 'helaena',
]

#replacing misspellings and nicknames of character names with actual names
# Order matters: the longest misspelling is replaced first. If the short
# 'rhaneyra' pattern ran first it would rewrite the prefix of 'rhaneyras' and
# 'rhaneyratargaryen', and the longer patterns would never match.
hotd_df['Text'] = (
    hotd_df['Text']
    .str.replace('rhaneyratargaryen', 'rhaenyra', regex=False)
    .str.replace('rhaneyras', 'rhaenyra', regex=False)
    .str.replace('rhaneyra', 'rhaenyra', regex=False)
    # Only append the missing "s" when it is not already there, so a correct
    # 'viserys' is left alone instead of becoming 'viseryss'.
    .str.replace(r'visery(?!s)', 'viserys', regex=True)
    # Collapse a doubled trailing "s" that was already present in the text.
    .str.replace('viseryss', 'viserys', regex=False)
)

## HOTD3
##GETTING TOP TWEET LOCATIONS
#replace empty values with NaN - *this affects the location column
# The pattern matches cells that are empty or contain only whitespace.
hotd = hotd.replace(r'^\s*$', np.nan, regex=True)
hotd.head()

#filling null values with next value using bfill method - replaces NaN with next value
# DataFrame.bfill() is the direct spelling of fillna(method='bfill'); the
# `method` argument of fillna is deprecated in recent pandas releases.
# Missing values at the very end of a column have no later value and stay NaN.
hotd_df = hotd.bfill()
hotd_df.head()

#visualization for top 10 tweet locations

## HOTD2
#hotd1_df - October 2022 tweets - Episode 7 to 10
# NOTE(review): the search query below spans 2022-08-21 to 2022-10-24, which is
# wider than the "October 2022" window named above - confirm which is intended.
# Upper bound compared against the enumerate index in the loop below.
maxTweets = 200000
# Creating list to append tweet data to
tweets_list = []

# Using TwitterSearchScraper to scrape data and append tweets to list
# NOTE(review): no append to tweets_list is visible in this fragment; the loop
# body shown here only contains the stop condition.
# NOTE(review): with ">" the item at index maxTweets still passes the check, so
# up to maxTweets + 1 items get through - confirm that is intended.
for i,tweet in enumerate(sntwitter.TwitterSearchScraper
                         ('#HouseoftheDragon since:2022-08-21 until:2022-10-24 lang:"en"').get_items()):
    if i>maxTweets:
        break

## HOTD 1
#import libraries

#core python packages for data analysis and arithmetics
import pandas as pd
import numpy as np
pd.set_option('max_colwidth', None)

#visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

## Naija_Genre5.py
#Genre Distribution
# Number of rows per genre, most frequent genre first.
Genre_Distr = result['Genre_Name'].value_counts()
Genre_Distr

#Visualising Genre distribution
# Bars follow the same most-frequent-first order as the counts above.
sns.countplot(
    data=result,
    y='Genre_Name',
    order=Genre_Distr.index,
    palette='viridis',
).set(title='Genre Distribution')

## Naija_Genre4.py
#Splitting songs into 5 genres
# A fixed random_state makes the centroid initialisation repeatable, so the
# cluster ids assigned to each song are the same on every run.
kmeans = KMeans(n_clusters=5, random_state=42)
label = kmeans.fit_predict(pca_scores)
unique_labels = np.unique(label)

#Plotting clusters
# One scatter call per cluster, plotting columns 0 and 1 of pca_scores.
for i in unique_labels:
    in_cluster = label == i  # boolean row mask, computed once per cluster
    plt.scatter(pca_scores[in_cluster, 0], pca_scores[in_cluster, 1], label=i, s=80)

plt.legend()

## Naija_Genre3.py
#standardization
# Fit the scaler on audio_features2 and transform it in the same call.
scaler = StandardScaler()
audio_features2_std = scaler.fit_transform(audio_features2)

#fitting standardised data using PCA
# PCA is built with default settings; fit() returns the fitted estimator.
pca = PCA().fit(audio_features2_std)

#variance generated by each feature
pca.explained_variance_ratio_
	#importing word cloud image from directory
	# The picture is read from a fixed local path and converted to a NumPy array.
	mask_path = "C:\\Users\\ofagb\\Downloads\\GOT.jpg"
	mask = np.array(Image.open(mask_path))

	# Preview the loaded array on an 18x12 figure with the axes hidden.
	plt.figure(figsize=(18, 12))
	plt.imshow(mask, interpolation='bilinear')
	plt.axis('off')
	plt.show()

	#create word cloud
	# Every setting is passed straight to the WordCloud constructor.
	masked_wordcloud = WordCloud(
	    background_color='white',
	    contour_color='black',
	    contour_width=3,
	    min_font_size=8,
	    max_words=300,
	    mask=mask,
	)
	#Calculating text polarity using TextBlob
	def polarity(Processed_Text):
	    """Return the TextBlob sentiment polarity of ``Processed_Text``.

	    The text is wrapped in a ``TextBlob`` and the ``polarity`` field of its
	    ``sentiment`` attribute is returned unchanged.
	    """
	    # The return statement is indented so it belongs to the function body.
	    return TextBlob(Processed_Text).sentiment.polarity

	#Add polarity score to df
	# Score every entry of the 'Processed_Text' column and store the result in
	# a new 'Polarity' column.
	processed_text_col = hotd_df2['Processed_Text']
	hotd_df2['Polarity'] = processed_text_col.apply(polarity)
	hotd_df2.head()

	#Creating polarity sentiment column
	def p_sentiment(label):
	#first step in text processing is taking stopwords out of the text using the NLTK stopwords corpus
	#stop words are common words in english that add little or no value in NLP
	# A fresh list is built from the words returned for 'english'.
	stop_words = [*stopwords.words('english')]

	#user defined common words that do not add value in the context of the show will also be removed
	# NOTE: 'GOT' and 'got' are both listed as separate entries.
	hotd_stop_words = [
	    'house', 'GOT', 'thrones', 'lady', 'lord', 'ser',
	    'gameofthrones', 'hotd', 'premiere', 'watching', 'got', 'episode',
	    'serie', 'houseofthedragon', 'fan', 'hbo', 'hbomax', 'finale',
	    'end', 'getting', 'today', 'week', 'watch', 'stream',
	    'houseofdragon', 'houseofthedragonhbo', 'houseofthedragonep',
	]
	#selecting main characters from HOTD show
	# Lower-case first names, five per row.
	hotd_chars = [
	    'rhaenyra', 'daemon', 'alicent', 'otto', 'viserys',
	    'rhaenys', 'vaemond', 'lucerys', 'jacaerys', 'lyonel',
	    'corlys', 'aegon', 'aemond', 'larys', 'harwin',
	    'laenor', 'laena', 'criston', 'mysaria', 'helaena',
	]

	#replacing misspellings and nicknames of character names with actual names
	# Order matters: the longest misspelling is replaced first. If the short
	# 'rhaneyra' pattern ran first it would rewrite the prefix of 'rhaneyras'
	# and 'rhaneyratargaryen', and the longer patterns would never match.
	hotd_df['Text'] = (
	    hotd_df['Text']
	    .str.replace('rhaneyratargaryen', 'rhaenyra', regex=False)
	    .str.replace('rhaneyras', 'rhaenyra', regex=False)
	    .str.replace('rhaneyra', 'rhaenyra', regex=False)
	    # Only append the missing "s" when it is not already there, so a
	    # correct 'viserys' is left alone instead of becoming 'viseryss'.
	    .str.replace(r'visery(?!s)', 'viserys', regex=True)
	    # Collapse a doubled trailing "s" that was already present in the text.
	    .str.replace('viseryss', 'viserys', regex=False)
	)
	##GETTING TOP TWEET LOCATIONS
	#replace empty values with NaN - *this affects the location column
	# The pattern matches cells that are empty or contain only whitespace.
	hotd = hotd.replace(r'^\s*$', np.nan, regex=True)
	hotd.head()

	#filling null values with next value using bfill method - replaces NaN with next value
	# DataFrame.bfill() is the direct spelling of fillna(method='bfill'); the
	# `method` argument of fillna is deprecated in recent pandas releases.
	# Missing values at the very end of a column have no later value and stay NaN.
	hotd_df = hotd.bfill()
	hotd_df.head()

	#visualization for top 10 tweet locations
	#hotd1_df - October 2022 tweets - Episode 7 to 10
	# NOTE(review): the search query below spans 2022-08-21 to 2022-10-24, which
	# is wider than the "October 2022" window named above - confirm which is
	# intended.
	# Upper bound compared against the enumerate index in the loop below.
	maxTweets = 200000
	# Creating list to append tweet data to
	tweets_list = []

	# Using TwitterSearchScraper to scrape data and append tweets to list
	# The loop and if bodies are indented so the stop condition sits inside the
	# loop.
	# NOTE(review): no append to tweets_list is visible in this fragment.
	# NOTE(review): with ">" the item at index maxTweets still passes the check,
	# so up to maxTweets + 1 items get through - confirm that is intended.
	for i,tweet in enumerate(sntwitter.TwitterSearchScraper
	                         ('#HouseoftheDragon since:2022-08-21 until:2022-10-24 lang:"en"').get_items()):
	    if i>maxTweets:
	        break
	#import libraries

	#core python packages for data analysis and arithmetics
	import pandas as pd
	import numpy as np
	pd.set_option('max_colwidth', None)

	#visualization packages
	import seaborn as sns
	import matplotlib.pyplot as plt
	#Genre Distribution
	# Number of rows per genre, most frequent genre first.
	Genre_Distr = result['Genre_Name'].value_counts()
	Genre_Distr

	#Visualising Genre distribution
	# Bars follow the same most-frequent-first order as the counts above.
	sns.countplot(
	    data=result,
	    y='Genre_Name',
	    order=Genre_Distr.index,
	    palette='viridis',
	).set(title='Genre Distribution')
	#Splitting songs into 5 genres
	# A fixed random_state makes the centroid initialisation repeatable, so the
	# cluster ids assigned to each song are the same on every run.
	kmeans = KMeans(n_clusters=5, random_state=42)
	label = kmeans.fit_predict(pca_scores)
	unique_labels = np.unique(label)

	#Plotting clusters
	# One scatter call per cluster, plotting columns 0 and 1 of pca_scores.
	# The scatter call is indented so it runs once per cluster id.
	for i in unique_labels:
	    in_cluster = label == i  # boolean row mask, computed once per cluster
	    plt.scatter(pca_scores[in_cluster, 0], pca_scores[in_cluster, 1], label=i, s=80)

	plt.legend()
	#standardization
	# Fit the scaler on audio_features2 and transform it in the same call.
	scaler = StandardScaler()
	audio_features2_std = scaler.fit_transform(audio_features2)

	#fitting standardised data using PCA
	# PCA is built with default settings; fit() returns the fitted estimator.
	pca = PCA().fit(audio_features2_std)

	#variance generated by each feature
	pca.explained_variance_ratio_