Instantly share code, notes, and snippets.
Created
July 17, 2019 21:08
-
Star
(0)
0
You must be signed in to star a gist -
Fork
(0)
0
You must be signed in to fork a gist
-
Save ishwor2048/1094cdec5a6e864d9683411c5a8c8d51 to your computer and use it in GitHub Desktop.
This project predicts social media usage across different age categories of people, based on various survey parameters.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###################### | |
# Importing new libraries into the terminal to run models | |
###################### | |
from sklearn.preprocessing import StandardScaler # standard scaler | |
from sklearn.decomposition import PCA # principal component analysis | |
############################ | |
# Importing the libraries required which is pretty much known | |
############################ | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
#################################
# Widen the pandas console output so wide survey tables print in full
# (up to 500 rows and 500 columns).
#################################
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

######################
# Load the raw survey workbook into a DataFrame
######################
survey_df = pd.read_excel("finalExam_Mobile_App_Survey_Data.xlsx")
#######################################
# Replace the coded question identifiers (q1, q2r1, ...) with readable
# column names. The mapping is kept in one named dict, one pair per line,
# so individual names are easy to audit.
#######################################
survey_column_names = {
    "q1": "Age",
    # q2r* -> device columns
    "q2r1": "iPhone",
    "q2r2": "iPod",
    "q2r3": "Android",
    "q2r4": "BlackBerry",
    "q2r5": "Nokia",
    "q2r6": "Windows",
    "q2r7": "HP",
    "q2r8": "Tablet",
    "q2r9": "Others_Phone",
    "q2r10": "No_Phone",
    # q4r* -> app-category columns
    "q4r1": "Music_&_Sound",
    "q4r2": "TV_Check-In",
    "q4r3": "Entertainment",
    "q4r4": "TV_Show",
    "q4r5": "Gaming",
    "q4r6": "Social_Network",
    "q4r7": "General_News",
    "q4r8": "Shopping",
    "q4r9": "Sp_Pub_News",
    "q4r10": "Other_Apps",
    "q4r11": "No_Apps",
    "q11": "No_of_Apps",
    "q12": "%age_free_download",
    # q13r* -> site-visit columns
    "q13r1": "Facebook_Visit",
    "q13r2": "Twitter_Visit",
    "q13r3": "MySpace_Visit",
    "q13r4": "Pandora_Visit",
    "q13r5": "Vevo_Visit",
    "q13r6": "YouTube_Visit",
    "q13r7": "AOL_Radio_Visit",
    "q13r8": "Last_FM_Visit",
    "q13r9": "Yahoo_Ent_Visit",
    "q13r10": "IMDB_Visit",
    "q13r11": "LinkedIn_Visit",
    "q13r12": "NetFlix_Visit",
    # q24r* -> technology-attitude columns
    "q24r1": "Keep_Tech_Dev_Up",
    "q24r2": "People_Ask_Advice",
    "q24r3": "Purchase_New_Gadgets",
    "q24r4": "Too_Much_Tech",
    "q24r5": "Enjoy_Tech",
    "q24r6": "Apps_Web_Save_Time",
    "q24r7": "Music_Imp",
    "q24r8": "TV_Show_Learn_More",
    "q24r9": "Too_Much_Info_Online",
    "q24r10": "Check_Friends_On_Facebook",
    "q24r11": "Internet_Useful",
    "q24r12": "Internet_Friends_Family",
    # q25r* -> self-description columns
    "q25r1": "Myself_Opinion_Leader",
    "q25r2": "Stand_Out",
    "q25r3": "Offer_Advice",
    "q25r4": "Decision_Lead",
    "q25r5": "First_To_Try_New",
    "q25r6": "Responsibility_Overloaded",
    "q25r7": "Like_Control",
    "q25r8": "Risk_Taker",
    "q25r9": "Creative",
    "q25r10": "Optimistic",
    "q25r11": "Very_Active",
    "q25r12": "Stretched",
    # q26r* -> shopping / app-spending columns
    "q26r18": "Luxury_Brands",
    "q26r3": "Bargain",
    "q26r4": "Any_Shopping",
    "q26r5": "Package_Plan",
    "q26r6": "Online_Shopping",
    "q26r7": "Designer_Brands",
    "q26r8": "No_Enough_Apps",
    "q26r9": "Cool_Apps",
    "q26r10": "Show_off_New_Apps",
    "q26r11": "Children_Impacted_App",
    "q26r12": "Spend_More_for_Apps",
    "q26r13": "Earn_Spend_More",
    "q26r14": "Whats_Hot_Whats_Not",
    "q26r15": "Brands_Reflect_Style",
    "q26r16": "Impulse_Purchase",
    "q26r17": "Mobile_Strong_Entertainment",
    # remaining columns: education, household and demographics
    "q48": "Schooling_Level",
    "q49": "Relationship_Status",
    "q50r1": "No_Children",
    "q50r2": "Child_U-6",
    "q50r3": "Child_6-12",
    "q50r4": "Child_13-17",
    "q50r5": "Child_18_Plus",
    "q54": "Race",
    "q55": "Hispanic/Latino",
    "q56": "Income_Level",
    "q57": "Sex",
}

# index=str also converts the row labels to strings, as the original call did.
survey_df.rename(index=str, columns=survey_column_names, inplace=True)
############################
# Save the renamed survey to disk, then reload it so the rest of the
# analysis runs on the saved copy.
#
# index=False keeps the row index out of the workbook. With the default
# (index=True) the index is written as an unnamed first column and comes
# back from read_excel as an "Unnamed: 0" column, which would then be
# treated as a survey feature by the scaling, PCA and k-means steps below.
############################
survey_df.to_excel('Exam_File_After_Rename_Columns.xlsx', index=False)

# Reloading also gives the frame a fresh default integer index.
survey_df = pd.read_excel("Exam_File_After_Rename_Columns.xlsx")
##############################################################################
# Exploration of the dataset
##############################################################################

# Column dtypes and non-null counts
survey_df.info()

# Histogram of every numeric column for a quick look at the distributions
survey_df.hist(bins=50, figsize=(20, 15))
plt.show()

# Missing values per column. Only columns that actually contain NaNs are
# printed; in a script a bare expression is discarded, so print explicitly.
nan = survey_df.isna().sum()
print(nan[nan > 0])

# Total number of missing cells in the data. The previous check used
# survey_df.columns.isnull(), which inspects the column *labels* rather than
# the values and therefore says nothing about missing data.
print(nan.sum())

# A total of 0 means there are no missing values in the dataset.
###############################################################################
# Principal Component Analysis (PCA)
###############################################################################

# Before running PCA, draw the pairwise column correlations as a 'hot'
# colour-mapped image.
correlation_matrix = survey_df.corr()
plt.imshow(correlation_matrix, cmap='hot', interpolation='nearest')
########################
# Separate the demographic columns from the survey features
########################
demographic_columns = ['Age', 'Race', 'Hispanic/Latino',
                       'Income_Level', 'Sex']
survey_features = survey_df.drop(columns=demographic_columns)

########################
# Standardise the features so each contributes with equal variance
########################
scaler = StandardScaler().fit(survey_features)
X_scaled_reduced = scaler.transform(survey_features)

########################
# Fit PCA keeping every component (n_components=None) so the scree plot
# can show the full variance profile
########################
survey_pca_reduced = PCA(n_components=None, random_state=508).fit(X_scaled_reduced)
X_pca_reduced = survey_pca_reduced.transform(X_scaled_reduced)
########################
# Scree plot: explained variance ratio per principal component, used to
# decide how many components to keep
########################
fig, ax = plt.subplots(figsize=(10, 8))

features = range(survey_pca_reduced.n_components_)

scree_line_style = dict(linewidth=2,
                        marker='o',
                        markersize=10,
                        markeredgecolor='black',
                        markerfacecolor='grey')
ax.plot(features,
        survey_pca_reduced.explained_variance_ratio_,
        **scree_line_style)

ax.set_title('Reduced Survey Information Plots')
ax.set_xlabel('PCA feature')
ax.set_ylabel('Explained Variance')
ax.set_xticks(features)

plt.show()
""" | |
After plotting the scree plot, three components stand out distinctly, so
3 principal components are retained for the upcoming PCA analysis.
""" | |
########################
# Re-run PCA with the chosen number of components (3)
########################
survey_pca_reduced = PCA(n_components=3,
                         random_state=508)
survey_pca_reduced.fit(X_scaled_reduced)

########################
# Factor loadings: how strongly each survey feature contributes to each
# principal component
########################
# components_ has one row per component; transposing gives one row per
# feature. The array's own .T is used because the pd.np alias was removed
# in pandas 2.0.
factor_loadings_df = pd.DataFrame(survey_pca_reduced.components_.T,
                                  index=survey_features.columns)

print(factor_loadings_df)

# Keep a copy of the loadings for inspection outside Python
factor_loadings_df.to_excel('survey_factor_loadings.xlsx')
########################
# Component scores for every respondent, one named column per component
########################
X_pca_reduced = survey_pca_reduced.transform(X_scaled_reduced)
X_pca_df = pd.DataFrame(X_pca_reduced,
                        columns=['Device_&_Networking', 'Social_Media', 'Lifestyle'])

########################
# Put the demographic columns back next to the component scores
########################
pca_demographics = survey_df.loc[:, ['Age', 'Race', 'Hispanic/Latino',
                                     'Income_Level', 'Sex']]
final_pca_df = pd.concat([pca_demographics, X_pca_df], axis=1)
########################
# Replace the numeric demographic codes with readable labels
########################
Gender = {1: 'Male',
          2: 'Female'}

HisLat = {1: 'Yes',
          2: 'No'}

Race = {1: 'White',
        2: 'Black',
        3: 'Asian',
        6: 'Other'}

Income = {1: 'U-10K',
          2: '10K-15K',
          3: '15K-20K',
          4: '20K-30K',
          5: '30K-40K',
          6: '40K-50K',
          7: '50K-60K',
          8: '60K-70K',
          9: '70K-80K',
          10: '80K-90K',
          11: '90K-100K',
          12: '100K-125K',
          13: '125K-150K',
          14: '150K +'}

Age = {1: 'U-18',
       2: '18-24',
       3: '25-29',
       4: '30-34',
       5: '35-39',
       6: '40-44',
       7: '45-49',
       8: '50-54',
       9: '55-59',
       10: '60-64',
       11: '65+'}

# Assign the recoded column back instead of calling
# df[col].replace(..., inplace=True): that form mutates a column selection,
# which does not update the parent frame under pandas Copy-on-Write.
pca_label_maps = {'Sex': Gender,
                  'Hispanic/Latino': HisLat,
                  'Race': Race,
                  'Income_Level': Income,
                  'Age': Age}

for column_name, label_map in pca_label_maps.items():
    final_pca_df[column_name] = final_pca_df[column_name].replace(label_map)
###########################################################################
# Box plots of each principal component against each demographic column
###########################################################################
# One row per plot: (x column, y column, figure size, output file).
# File names are kept exactly as before. The Hispanic/Latino vs Social_Media
# plot was previously the only one never written to disk, so it gets a file
# name here for consistency with the other fourteen.
pca_boxplot_specs = [
    # Income level
    ('Income_Level', 'Social_Media', (8, 4),
     'Income Level & Social Media Relationship.png'),
    ('Income_Level', 'Lifestyle', (8, 4),
     'Income Level and Lifestyle.png'),
    ('Income_Level', 'Device_&_Networking', (10, 5),
     'Income Level & Device_&_Networking.png'),
    # Sex
    ('Sex', 'Social_Media', (8, 4),
     'Sex and Social Media Relationship.png'),
    ('Sex', 'Lifestyle', (8, 4),
     'Sex and Lifestyle.png'),
    ('Sex', 'Device_&_Networking', (8, 4),
     'Sex and Device & Networking.png'),
    # Race
    ('Race', 'Social_Media', (8, 4),
     'Race & Social Media.png'),
    ('Race', 'Lifestyle', (8, 4),
     'Race and Lifestyle Comparison.png'),
    ('Race', 'Device_&_Networking', (8, 4),
     'Race and Device & Networking Relationship.png'),
    # Age
    ('Age', 'Social_Media', (8, 4),
     'Age and Social Media Relationship.png'),
    ('Age', 'Lifestyle', (8, 4),
     'Age & Lifestyle Correlation.png'),
    ('Age', 'Device_&_Networking', (8, 4),
     'Age and Device & Networking Connection.png'),
    # Hispanic/Latino
    ('Hispanic/Latino', 'Social_Media', (8, 4),
     'Hispanic Latino Relation with Social Media.png'),
    ('Hispanic/Latino', 'Lifestyle', (8, 4),
     'Hispanic Latino Relation with lifestyle.png'),
    ('Hispanic/Latino', 'Device_&_Networking', (8, 4),
     'Hispanic Latino Relation with Device & Networking.png'),
]

for x_column, y_column, figure_size, output_file in pca_boxplot_specs:
    fig, ax = plt.subplots(figsize=figure_size)

    sns.boxplot(x=x_column,
                y=y_column,
                data=final_pca_df)

    plt.ylim(-2, 4)
    plt.tight_layout()

    # Save BEFORE show: once show() returns the figure may already be closed,
    # and a later savefig() would write an empty canvas.
    plt.savefig(output_file)
    plt.show()
###############################################################################
# Cluster Analysis One More Time!!!
###############################################################################

# k-means implementation used for the clustering below
from sklearn.cluster import KMeans  # k-means clustering

########################
# Survey features only: demographic columns are set aside
########################
survey_features_reduced = survey_df.drop(
    columns=['Age', 'Race', 'Hispanic/Latino', 'Income_Level', 'Sex'])

########################
# Standardise so every feature has equal variance for the distance metric
########################
scaler = StandardScaler().fit(survey_features_reduced)
X_scaled_reduced = scaler.transform(survey_features_reduced)

########################
# Fit k-means with 5 clusters (a different cluster count than tried before)
########################
survey_k = KMeans(n_clusters=5, random_state=508).fit(X_scaled_reduced)

# One cluster label per respondent, then the size of each cluster
survey_kmeans_clusters = pd.DataFrame({'cluster': survey_k.labels_})
print(survey_kmeans_clusters['cluster'].value_counts())
########################
# Cluster centres, labelled with the feature names so each centroid
# coordinate can be read against its survey column
########################
centroids = survey_k.cluster_centers_
centroids_df = pd.DataFrame(centroids,
                            columns=survey_features_reduced.columns)
print(centroids_df)

#########################
# Keep a copy of the centroids in Excel so they can be reviewed without
# re-running the model
#########################
centroids_df.to_excel('survey_k3_centriods.xlsx')

###############################
# Cluster memberships: cluster label next to each respondent's scaled
# feature values
###############################
X_scaled_reduced_df = pd.DataFrame(X_scaled_reduced,
                                   columns=survey_features_reduced.columns)

membership_parts = [survey_kmeans_clusters, X_scaled_reduced_df]
clusters_df = pd.concat(membership_parts, axis=1)
print(clusters_df)

clusters_df.to_excel('clusters_DataFrame.xlsx')
##########################################
# Reattach the demographic columns to the cluster memberships
##########################################
final_clusters_df = pd.concat(
    [survey_df.loc[:, ['Age', 'Race', 'Hispanic/Latino',
                       'Income_Level', 'Sex']],
     clusters_df],
    axis=1)
print(final_clusters_df)

##################################
# Replace the numeric demographic codes with readable labels
##################################
Gender = {1: 'Male',
          2: 'Female'}

HisLat = {1: 'Yes',
          2: 'No'}

Race = {1: 'White',
        2: 'Black',
        3: 'Asian',
        6: 'Other'}

Income = {1: 'U-10K',
          2: '10K-15K',
          3: '15K-20K',
          4: '20K-30K',
          5: '30K-40K',
          6: '40K-50K',
          7: '50K-60K',
          8: '60K-70K',
          9: '70K-80K',
          10: '80K-90K',
          11: '90K-100K',
          12: '100K-125K',
          13: '125K-150K',
          14: '150K +'}

Age = {1: 'U-18',
       2: '18-24',
       3: '25-29',
       4: '30-34',
       5: '35-39',
       6: '40-44',
       7: '45-49',
       8: '50-54',
       9: '55-59',
       10: '60-64',
       11: '65+'}

# Assign the recoded column back instead of calling
# df[col].replace(..., inplace=True): that form mutates a column selection,
# which does not update the parent frame under pandas Copy-on-Write.
cluster_label_maps = {'Sex': Gender,
                      'Hispanic/Latino': HisLat,
                      'Race': Race,
                      'Income_Level': Income,
                      'Age': Age}

for column_name, label_map in cluster_label_maps.items():
    final_clusters_df[column_name] = final_clusters_df[column_name].replace(label_map)
########################
# Box plots of the scaled 'Entertainment' feature per demographic group,
# split by k-means cluster
########################
# One row per plot: (demographic column on the x axis, output file).
# File names are unchanged.
cluster_boxplot_specs = [
    ('Age', 'Age & Entertainment Connection.png'),
    ('Sex', 'Sex & Entertainment Connection.png'),
    ('Income_Level', 'Income level & Entertainment Connection.png'),
    ('Race', 'Race & Entertainment Connection.png'),
    ('Hispanic/Latino', 'Hispanic Latino & Entertainment Connection.png'),
]

for demographic_column, output_file in cluster_boxplot_specs:
    fig, ax = plt.subplots(figsize=(8, 4))

    sns.boxplot(x=demographic_column,
                y='Entertainment',
                hue='cluster',
                data=final_clusters_df)

    plt.ylim(-2, 4)
    plt.tight_layout()

    # Save BEFORE show: once show() returns the figure may already be closed,
    # and a later savefig() would write an empty canvas.
    plt.savefig(output_file)
    plt.show()
###############################################################################
# Model Code
###############################################################################
# Combine both models: run k-means on the principal-component scores
# computed earlier (X_pca_df). No data is re-imported here.

########################
# Inspect the PCA-transformed data
########################
print(X_pca_df.head(n=5))

# Per-column population variance (ddof=0). DataFrame.var is called directly
# because the pd.np alias was removed in pandas 2.0.
print(X_pca_df.var(ddof=0))

########################
# Rescale the component scores so each component has equal variance
########################
scaler = StandardScaler()
scaler.fit(X_pca_df)
X_pca_clust = scaler.transform(X_pca_df)

# Label the columns before printing so the variance output is readable
X_pca_clust_df = pd.DataFrame(X_pca_clust, columns=X_pca_df.columns)
print(X_pca_clust_df.var(ddof=0))

########################
# k-means with 5 clusters on the scaled component scores
########################
survey_k_pca = KMeans(n_clusters=5,
                      random_state=508)
survey_k_pca.fit(X_pca_clust_df)

survey_kmeans_pca = pd.DataFrame({'cluster': survey_k_pca.labels_})
print(survey_kmeans_pca.iloc[:, 0].value_counts())

########################
# Cluster centres, labelled with the principal-component names
########################
centroids_pca = survey_k_pca.cluster_centers_
centroids_pca_df = pd.DataFrame(centroids_pca,
                                columns=['Device_&_Networking', 'Social_Media', 'Lifestyle'])
print(centroids_pca_df)

################################################################
# Export the unsupervised-learning results (PCA cluster centres) to Excel
################################################################
centroids_pca_df.to_excel('survey_pca_centriods.xlsx')
""" | |
The analysis below examines the survey in more detail, to understand
what the data says when taking a deeper dive.
""" | |
########################
# Cluster memberships for the PCA model: cluster label next to each
# respondent's scaled component scores
########################
pca_membership_parts = [survey_kmeans_pca, X_pca_clust_df]
clst_pca_df = pd.concat(pca_membership_parts, axis=1)
print(clst_pca_df)
########################
# Reattach the demographic columns to the PCA-based cluster memberships
########################
# clst_pca_df holds the cluster labels from the k-means fit on the
# principal-component scores. Joining clusters_df here (the raw-feature
# clustering) would make this whole section repeat the earlier
# raw-feature cluster plots instead of analysing the combined model.
final_pca_clusters_df = pd.concat(
    [survey_df.loc[:, ['Age', 'Race', 'Hispanic/Latino',
                       'Income_Level', 'Sex']],
     clst_pca_df],
    axis=1)
print(final_pca_clusters_df.head(n=5))

###############################################################################
# Replace the numeric demographic codes with readable labels
###############################################################################
Gender = {1: 'Male',
          2: 'Female'}

HisLat = {1: 'Yes',
          2: 'No'}

Race = {1: 'White',
        2: 'Black',
        3: 'Asian',
        6: 'Other'}

Income = {1: 'U-10K',
          2: '10K-15K',
          3: '15K-20K',
          4: '20K-30K',
          5: '30K-40K',
          6: '40K-50K',
          7: '50K-60K',
          8: '60K-70K',
          9: '70K-80K',
          10: '80K-90K',
          11: '90K-100K',
          12: '100K-125K',
          13: '125K-150K',
          14: '150K +'}

Age = {1: 'U-18',
       2: '18-24',
       3: '25-29',
       4: '30-34',
       5: '35-39',
       6: '40-44',
       7: '45-49',
       8: '50-54',
       9: '55-59',
       10: '60-64',
       11: '65+'}

# Assign the recoded column back instead of calling
# df[col].replace(..., inplace=True): that form mutates a column selection,
# which does not update the parent frame under pandas Copy-on-Write.
pca_cluster_label_maps = {'Sex': Gender,
                          'Hispanic/Latino': HisLat,
                          'Race': Race,
                          'Income_Level': Income,
                          'Age': Age}

for column_name, label_map in pca_cluster_label_maps.items():
    final_pca_clusters_df[column_name] = (
        final_pca_clusters_df[column_name].replace(label_map))

# Short alias used by the plots below
data_df = final_pca_clusters_df

########################
# Box plots of every principal component per demographic group, split by
# PCA-based cluster. The joined frame holds component scores rather than
# raw survey features, so the y axis is a component, not 'Entertainment'.
########################
for demographic_column in ('Age', 'Sex', 'Income_Level',
                           'Race', 'Hispanic/Latino'):
    for component_name in ('Device_&_Networking', 'Social_Media', 'Lifestyle'):
        fig, ax = plt.subplots(figsize=(8, 4))

        sns.boxplot(x=demographic_column,
                    y=component_name,
                    hue='cluster',
                    data=data_df)

        plt.ylim(-2, 4)
        plt.tight_layout()
        plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment