Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ishwor2048/1094cdec5a6e864d9683411c5a8c8d51 to your computer and use it in GitHub Desktop.
Save ishwor2048/1094cdec5a6e864d9683411c5a8c8d51 to your computer and use it in GitHub Desktop.
This project predicts social media usage across different age categories of people, based on various survey parameters.
######################
# Importing new libraries into the terminal to run models
######################
from sklearn.preprocessing import StandardScaler # standard scaler
from sklearn.decomposition import PCA # principal component analysis
############################
# Importing the libraries required which is pretty much known
############################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#################################
# Raise the pandas display limits so wide/long frames print without truncation
#################################
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

######################
# Load the raw survey responses from the Excel workbook
######################
survey_df = pd.read_excel("finalExam_Mobile_App_Survey_Data.xlsx")
#######################################
# Map the raw questionnaire codes (q1, q2r1, ...) to descriptive column names
# so the rest of the analysis can refer to readable labels.
#######################################
column_names = {
    "q1": "Age",
    "q2r1": "iPhone",
    "q2r2": "iPod",
    "q2r3": "Android",
    "q2r4": "BlackBerry",
    "q2r5": "Nokia",
    "q2r6": "Windows",
    "q2r7": "HP",
    "q2r8": "Tablet",
    "q2r9": "Others_Phone",
    "q2r10": "No_Phone",
    "q4r1": "Music_&_Sound",
    "q4r2": "TV_Check-In",
    "q4r3": "Entertainment",
    "q4r4": "TV_Show",
    "q4r5": "Gaming",
    "q4r6": "Social_Network",
    "q4r7": "General_News",
    "q4r8": "Shopping",
    "q4r9": "Sp_Pub_News",
    "q4r10": "Other_Apps",
    "q4r11": "No_Apps",
    "q11": "No_of_Apps",
    "q12": "%age_free_download",
    "q13r1": "Facebook_Visit",
    "q13r2": "Twitter_Visit",
    "q13r3": "MySpace_Visit",
    "q13r4": "Pandora_Visit",
    "q13r5": "Vevo_Visit",
    "q13r6": "YouTube_Visit",
    "q13r7": "AOL_Radio_Visit",
    "q13r8": "Last_FM_Visit",
    "q13r9": "Yahoo_Ent_Visit",
    "q13r10": "IMDB_Visit",
    "q13r11": "LinkedIn_Visit",
    "q13r12": "NetFlix_Visit",
    "q24r1": "Keep_Tech_Dev_Up",
    "q24r2": "People_Ask_Advice",
    "q24r3": "Purchase_New_Gadgets",
    "q24r4": "Too_Much_Tech",
    "q24r5": "Enjoy_Tech",
    "q24r6": "Apps_Web_Save_Time",
    "q24r7": "Music_Imp",
    "q24r8": "TV_Show_Learn_More",
    "q24r9": "Too_Much_Info_Online",
    "q24r10": "Check_Friends_On_Facebook",
    "q24r11": "Internet_Useful",
    "q24r12": "Internet_Friends_Family",
    "q25r1": "Myself_Opinion_Leader",
    "q25r2": "Stand_Out",
    "q25r3": "Offer_Advice",
    "q25r4": "Decision_Lead",
    "q25r5": "First_To_Try_New",
    "q25r6": "Responsibility_Overloaded",
    "q25r7": "Like_Control",
    "q25r8": "Risk_Taker",
    "q25r9": "Creative",
    "q25r10": "Optimistic",
    "q25r11": "Very_Active",
    "q25r12": "Stretched",
    "q26r18": "Luxury_Brands",
    "q26r3": "Bargain",
    "q26r4": "Any_Shopping",
    "q26r5": "Package_Plan",
    "q26r6": "Online_Shopping",
    "q26r7": "Designer_Brands",
    "q26r8": "No_Enough_Apps",
    "q26r9": "Cool_Apps",
    "q26r10": "Show_off_New_Apps",
    "q26r11": "Children_Impacted_App",
    "q26r12": "Spend_More_for_Apps",
    "q26r13": "Earn_Spend_More",
    "q26r14": "Whats_Hot_Whats_Not",
    "q26r15": "Brands_Reflect_Style",
    "q26r16": "Impulse_Purchase",
    "q26r17": "Mobile_Strong_Entertainment",
    "q48": "Schooling_Level",
    "q49": "Relationship_Status",
    "q50r1": "No_Children",
    "q50r2": "Child_U-6",
    "q50r3": "Child_6-12",
    "q50r4": "Child_13-17",
    "q50r5": "Child_18_Plus",
    "q54": "Race",
    "q55": "Hispanic/Latino",
    "q56": "Income_Level",
    "q57": "Sex",
}
survey_df = survey_df.rename(columns=column_names)
############################
# Save the renamed data to the local drive.
# index=False keeps the row index out of the workbook; otherwise it is read
# back as an extra "Unnamed: 0" column and ends up among the model features.
############################
survey_df.to_excel('Exam_File_After_Rename_Columns.xlsx', index=False)
#################################
# Reload the column-renamed file so the analysis runs from the saved copy
#################################
survey_df = pd.read_excel("Exam_File_After_Rename_Columns.xlsx")
##############################################################################
# Exploration of the dataset
##############################################################################
# Column dtypes and non-null counts
survey_df.info()

# Histograms of every numeric column for a quick look at the distributions
survey_df.hist(bins=50, figsize=(20, 15))
plt.show()

# Missing cells per column; only columns that actually contain NaNs are listed.
# Printed explicitly because a bare expression shows nothing when run as a script.
missing_per_column = survey_df.isna().sum()
print(missing_per_column[missing_per_column > 0])

# Total number of missing cells in the whole frame. The check runs on the
# data itself, not on the column labels. A result of 0 means no missing values.
print(
    survey_df
    .isnull()
    .sum()
    .sum()
)
###############################################################################
###############################################################################
# Principal Component Analysis (PCA)
###############################################################################
###############################################################################
# Start with a heat map of the pairwise column correlations
correlation_matrix = survey_df.corr()
plt.imshow(correlation_matrix, cmap='hot', interpolation='nearest')

########################
# Set the demographic columns aside; only the remaining columns are modelled
########################
demographic_columns = ['Age', 'Race', 'Hispanic/Latino', 'Income_Level', 'Sex']
survey_features = survey_df.drop(columns=demographic_columns)

########################
# Standardise the features so every column has comparable variance
########################
scaler = StandardScaler()
X_scaled_reduced = scaler.fit_transform(survey_features)

########################
# Fit PCA keeping every component (no limit on n_components)
########################
survey_pca_reduced = PCA(n_components=None, random_state=508)
survey_pca_reduced.fit(X_scaled_reduced)
X_pca_reduced = survey_pca_reduced.transform(X_scaled_reduced)
########################
# Scree plot: explained-variance ratio per component, used to decide how
# many components to retain
########################
fig, ax = plt.subplots(figsize=(10, 8))
features = range(survey_pca_reduced.n_components_)

marker_style = {
    'linewidth': 2,
    'marker': 'o',
    'markersize': 10,
    'markeredgecolor': 'black',
    'markerfacecolor': 'grey',
}
ax.plot(features, survey_pca_reduced.explained_variance_ratio_, **marker_style)

ax.set_title('Reduced Survey Information Plots')
ax.set_xlabel('PCA feature')
ax.set_ylabel('Explained Variance')
ax.set_xticks(features)
plt.show()
"""
Based on the scree plot, three principal components are retained for the
upcoming PCA analysis.
"""
########################
# Re-run PCA with the chosen number of components (3)
########################
survey_pca_reduced = PCA(n_components=3,
                         random_state=508)
survey_pca_reduced.fit(X_scaled_reduced)
########################
# Factor loadings: one row per original feature, one column per component.
# components_ is (n_components, n_features), so it is transposed first.
# numpy is used through its own import; the `pd.np` alias no longer exists
# in current pandas releases.
########################
factor_loadings_df = pd.DataFrame(np.transpose(survey_pca_reduced.components_))
factor_loadings_df = factor_loadings_df.set_index(survey_features.columns[:])
print(factor_loadings_df)
factor_loadings_df.to_excel('survey_factor_loadings.xlsx')
########################
# Component scores per survey respondent
########################
X_pca_reduced = survey_pca_reduced.transform(X_scaled_reduced)
X_pca_df = pd.DataFrame(X_pca_reduced)
########################
# Name the principal components and reattach the demographic columns
########################
X_pca_df.columns = ['Device_&_Networking', 'Social_Media', 'Lifestyle']
final_pca_df = pd.concat([survey_df.loc[:, ['Age', 'Race', 'Hispanic/Latino',
                                            'Income_Level', 'Sex']],
                          X_pca_df], axis=1)
########################
# Decode the numeric demographic codes into readable labels
########################
# Sex
Gender = {1: 'Male',
          2: 'Female'}
# Hispanic/Latino
HisLat = {1: 'Yes',
          2: 'No'}
# Race (codes without an entry here are left unchanged)
Race = {1: 'White',
        2: 'Black',
        3: 'Asian',
        6: 'Other'}
# Income level
Income = {1: 'U-10K',
          2: '10K-15K',
          3: '15K-20K',
          4: '20K-30K',
          5: '30K-40K',
          6: '40K-50K',
          7: '50K-60K',
          8: '60K-70K',
          9: '70K-80K',
          10: '80K-90K',
          11: '90K-100K',
          12: '100K-125K',
          13: '125K-150K',
          14: '150K +'}
# Age
Age = {1: 'U-18',
       2: '18-24',
       3: '25-29',
       4: '30-34',
       5: '35-39',
       6: '40-44',
       7: '45-49',
       8: '50-54',
       9: '55-59',
       10: '60-64',
       11: '65+'}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# the selected column acts on a temporary object under pandas Copy-on-Write
# and would leave the frame unchanged.
demographic_labels = {'Sex': Gender,
                      'Hispanic/Latino': HisLat,
                      'Race': Race,
                      'Income_Level': Income,
                      'Age': Age}
for demographic_column, code_labels in demographic_labels.items():
    final_pca_df[demographic_column] = (
        final_pca_df[demographic_column].replace(code_labels))
###########################################################################
# Box plots of every principal component against every demographic column.
# Each entry is (demographic column, component column, figure size, output
# file name); a file name of None means the plot is shown but not saved.
pca_boxplots = [
    # Analyzing by Income Level
    ('Income_Level', 'Social_Media', (8, 4),
     'Income Level & Social Media Relationship.png'),
    ('Income_Level', 'Lifestyle', (8, 4),
     'Income Level and Lifestyle.png'),
    ('Income_Level', 'Device_&_Networking', (10, 5),
     'Income Level & Device_&_Networking.png'),
    # Analyzing by Sex
    ('Sex', 'Social_Media', (8, 4),
     'Sex and Social Media Relationship.png'),
    ('Sex', 'Lifestyle', (8, 4),
     'Sex and Lifestyle.png'),
    ('Sex', 'Device_&_Networking', (8, 4),
     'Sex and Device & Networking.png'),
    # Analyzing by Race
    ('Race', 'Social_Media', (8, 4),
     'Race & Social Media.png'),
    ('Race', 'Lifestyle', (8, 4),
     'Race and Lifestyle Comparison.png'),
    ('Race', 'Device_&_Networking', (8, 4),
     'Race and Device & Networking Relationship.png'),
    # Analyzing by Age
    ('Age', 'Social_Media', (8, 4),
     'Age and Social Media Relationship.png'),
    ('Age', 'Lifestyle', (8, 4),
     'Age & Lifestyle Correlation.png'),
    ('Age', 'Device_&_Networking', (8, 4),
     'Age and Device & Networking Connection.png'),
    # Analyzing by Hispanic/Latino
    ('Hispanic/Latino', 'Social_Media', (8, 4),
     None),
    ('Hispanic/Latino', 'Lifestyle', (8, 4),
     'Hispanic Latino Relation with lifestyle.png'),
    ('Hispanic/Latino', 'Device_&_Networking', (8, 4),
     'Hispanic Latino Relation with Device & Networking.png'),
]
for demographic, component, figure_size, file_name in pca_boxplots:
    fig, ax = plt.subplots(figsize=figure_size)
    sns.boxplot(x=demographic,
                y=component,
                data=final_pca_df)
    plt.ylim(-2, 4)
    plt.tight_layout()
    # Save before showing: on interactive backends plt.show() releases the
    # figure, so a later savefig would write an empty image.
    if file_name is not None:
        plt.savefig(file_name)
    plt.show()
###############################################################################
###############################################################################
# Cluster Analysis One More Time!!!
###############################################################################
###############################################################################
########################
#Importing the required package KMeans
########################
from sklearn.cluster import KMeans # k-means clustering
########################
# Drop the demographic columns; clustering uses the remaining columns only
########################
survey_features_reduced = survey_df.drop(
    columns=['Age', 'Race', 'Hispanic/Latino', 'Income_Level', 'Sex'])

########################
# Standardise the features so each contributes with equal variance
########################
scaler = StandardScaler()
X_scaled_reduced = scaler.fit_transform(survey_features_reduced)

########################
# Fit k-means with 5 clusters and report how many rows fall in each
########################
survey_k = KMeans(n_clusters=5, random_state=508)
survey_k.fit(X_scaled_reduced)

survey_kmeans_clusters = pd.DataFrame({'cluster': survey_k.labels_})
print(survey_kmeans_clusters['cluster'].value_counts())

########################
# Cluster centres, labelled with the feature names for readability
########################
centroids = survey_k.cluster_centers_
centroids_df = pd.DataFrame(centroids,
                            columns=survey_features_reduced.columns)
print(centroids_df)

#########################
# Persist the centres to Excel so they can be inspected without re-running
#########################
centroids_df.to_excel('survey_k3_centriods.xlsx')
###############################
# Cluster memberships: put each row's cluster label next to its scaled features
###############################
X_scaled_reduced_df = pd.DataFrame(X_scaled_reduced,
                                   columns=survey_features_reduced.columns)

cluster_frames = [survey_kmeans_clusters, X_scaled_reduced_df]
clusters_df = pd.concat(cluster_frames, axis=1)
print(clusters_df)
clusters_df.to_excel('clusters_DataFrame.xlsx')

##########################################
# Reattach the demographic columns in front of the cluster data
##########################################
demographics_df = survey_df[['Age', 'Race', 'Hispanic/Latino',
                             'Income_Level', 'Sex']]
final_clusters_df = pd.concat([demographics_df, clusters_df], axis=1)
print(final_clusters_df)
##################################
# Decode the numeric demographic codes of the cluster frame into readable
# labels for the plots that follow
##################################
# Sex
Gender = {1: 'Male',
          2: 'Female'}
# Hispanic/Latino
HisLat = {1: 'Yes',
          2: 'No'}
# Race (codes without an entry here are left unchanged)
Race = {1: 'White',
        2: 'Black',
        3: 'Asian',
        6: 'Other'}
# Income level
Income = {1: 'U-10K',
          2: '10K-15K',
          3: '15K-20K',
          4: '20K-30K',
          5: '30K-40K',
          6: '40K-50K',
          7: '50K-60K',
          8: '60K-70K',
          9: '70K-80K',
          10: '80K-90K',
          11: '90K-100K',
          12: '100K-125K',
          13: '125K-150K',
          14: '150K +'}
# Age
Age = {1: 'U-18',
       2: '18-24',
       3: '25-29',
       4: '30-34',
       5: '35-39',
       6: '40-44',
       7: '45-49',
       8: '50-54',
       9: '55-59',
       10: '60-64',
       11: '65+'}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# the selected column acts on a temporary object under pandas Copy-on-Write
# and would leave the frame unchanged.
cluster_demographic_labels = {'Sex': Gender,
                              'Hispanic/Latino': HisLat,
                              'Race': Race,
                              'Income_Level': Income,
                              'Age': Age}
for demographic_column, code_labels in cluster_demographic_labels.items():
    final_clusters_df[demographic_column] = (
        final_clusters_df[demographic_column].replace(code_labels))
########################
# Box plots of the scaled 'Entertainment' feature per demographic column,
# split by k-means cluster. Each entry is (demographic column, output file).
########################
cluster_boxplots = [
    ('Age', 'Age & Entertainment Connection.png'),
    ('Sex', 'Sex & Entertainment Connection.png'),
    ('Income_Level', 'Income level & Entertainment Connection.png'),
    ('Race', 'Race & Entertainment Connection.png'),
    ('Hispanic/Latino', 'Hispanic Latino & Entertainment Connection.png'),
]
for demographic, file_name in cluster_boxplots:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=demographic,
                y='Entertainment',
                hue='cluster',
                data=final_clusters_df)
    plt.ylim(-2, 4)
    plt.tight_layout()
    # Save before showing: on interactive backends plt.show() releases the
    # figure, so a later savefig would write an empty image.
    plt.savefig(file_name)
    plt.show()
#Analysis looks interesting though
###############################################################################
###############################################################################
#Model Code
###############################################################################
###############################################################################
"""
Now it is time to combine both models and see how the research develops
while building the combined model.
The data was already imported above; this section reuses that data and
only runs the model below."""
########################
# Start from the PCA-transformed frame (one column per principal component)
########################
print(X_pca_df.head(n=5))
# Per-column variance. axis=0 is passed explicitly so the reduction stays
# column-wise, and numpy is used through its own import because the `pd.np`
# alias no longer exists in current pandas releases.
print(np.var(X_pca_df, axis=0))
########################
# Scale the components so each has the same range before clustering
########################
scaler = StandardScaler()
scaler.fit(X_pca_df)
X_pca_clust = scaler.transform(X_pca_df)
X_pca_clust_df = pd.DataFrame(X_pca_clust, columns=X_pca_df.columns)
print(np.var(X_pca_clust_df, axis=0))
########################
# Fit k-means with 5 clusters on the scaled components
########################
survey_k_pca = KMeans(n_clusters=5,
                      random_state=508)
survey_k_pca.fit(X_pca_clust_df)
survey_kmeans_pca = pd.DataFrame({'cluster': survey_k_pca.labels_})
print(survey_kmeans_pca.iloc[:, 0].value_counts())
########################
# Cluster centres, labelled with the principal-component names
########################
centroids_pca = survey_k_pca.cluster_centers_
centroids_pca_df = pd.DataFrame(centroids_pca)
centroids_pca_df.columns = ['Device_&_Networking', 'Social_Media', 'Lifestyle']
print(centroids_pca_df)
################################################################
################################################################
# Sending data to Excel (unsupervised learning algorithm results)
################################################################
################################################################
centroids_pca_df.to_excel('survey_pca_centriods.xlsx')
"""
The analysis below examines the survey in more detail to understand what
the data says when taking a deeper dive.
"""
########################
# Cluster memberships: PCA-based cluster label next to the scaled components
########################
clst_pca_df = pd.concat([survey_kmeans_pca,
                         X_pca_clust_df],
                        axis=1)
print(clst_pca_df)
########################
# Reattach demographic information.
# The PCA cluster frame (clst_pca_df) is joined here; joining clusters_df
# instead would repeat the raw-feature clustering and ignore the PCA clusters.
########################
final_pca_clusters_df = pd.concat(
    [survey_df.loc[:, ['Age', 'Race', 'Hispanic/Latino',
                       'Income_Level', 'Sex']],
     clst_pca_df],
    axis=1)
print(final_pca_clusters_df.head(n=5))
###############################################################################
# Decode the numeric demographic codes into readable labels
###############################################################################
# Sex
Gender = {1: 'Male',
          2: 'Female'}
# Hispanic/Latino
HisLat = {1: 'Yes',
          2: 'No'}
# Race (codes without an entry here are left unchanged)
Race = {1: 'White',
        2: 'Black',
        3: 'Asian',
        6: 'Other'}
# Income level
Income = {1: 'U-10K',
          2: '10K-15K',
          3: '15K-20K',
          4: '20K-30K',
          5: '30K-40K',
          6: '40K-50K',
          7: '50K-60K',
          8: '60K-70K',
          9: '70K-80K',
          10: '80K-90K',
          11: '90K-100K',
          12: '100K-125K',
          13: '125K-150K',
          14: '150K +'}
# Age
Age = {1: 'U-18',
       2: '18-24',
       3: '25-29',
       4: '30-34',
       5: '35-39',
       6: '40-44',
       7: '45-49',
       8: '50-54',
       9: '55-59',
       10: '60-64',
       11: '65+'}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# the selected column acts on a temporary object under pandas Copy-on-Write
# and would leave the frame unchanged.
pca_demographic_labels = {'Sex': Gender,
                          'Hispanic/Latino': HisLat,
                          'Race': Race,
                          'Income_Level': Income,
                          'Age': Age}
for demographic_column, code_labels in pca_demographic_labels.items():
    final_pca_clusters_df[demographic_column] = (
        final_pca_clusters_df[demographic_column].replace(code_labels))
# Shorter alias used by the plots below
data_df = final_pca_clusters_df
########################
# Box plots of each scaled principal component per demographic column,
# split by PCA-based cluster. The component columns are taken from the
# clustered frame itself, so the plots always match the clustered columns.
########################
component_columns = list(X_pca_clust_df.columns)
for demographic in ['Age', 'Sex', 'Income_Level', 'Race', 'Hispanic/Latino']:
    for component in component_columns:
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.boxplot(x=demographic,
                    y=component,
                    hue='cluster',
                    data=data_df)
        plt.ylim(-2, 4)
        plt.tight_layout()
        plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment