Created
November 19, 2019 21:58
-
-
Save TheBubblePopped/9b6a75ea34bc20668a1e5d53103e755c to your computer and use it in GitHub Desktop.
visdat
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
from mpl_toolkits.mplot3d import Axes3D | |
from sklearn.cluster import KMeans | |
from sklearn.datasets import make_blobs | |
import pandas as pd | |
from sklearn import preprocessing as p | |
import plotly.graph_objects as go | |
# Widen pandas' console output so wide nutrient tables print without truncation.
pd.set_option(
    'display.max_columns', 80,
    'display.max_rows', 300,
    'display.width', 1000,
)
from matplotlib import colors | |
# Cluster index -> hex colour used by every chart in this script.
# NOTE: this rebinds ``colors`` and therefore hides the ``matplotlib.colors``
# module imported on the previous line.
colors = {
    0: '#1b9e77',
    1: '#d95f02',
    2: '#7570b3',
    3: '#e7298a',
    4: '#66a61e',
    5: '#e6ab02',
    6: '#a6761d',
    7: '#666666',
}

# Raw table as read from the spreadsheet.
df = pd.read_excel('USDA_Food_Database.xlsx')

# Early copies taken while the 'Keyword' and 'Name' columns are still present;
# both drop every row that has a missing value in any column.
temp = df.copy().dropna()
temp2 = df.copy().dropna()

# From here on ``df`` holds only the remaining (non-identifier) columns.
df.drop(['No', 'Name', 'Keyword'], axis=1, inplace=True)
cols = df.columns
df.dropna(inplace=True)

# MinMaxScaler returns a bare ndarray, so the column names are re-attached.
puh2 = p.MinMaxScaler().fit_transform(df)
newdf2 = pd.DataFrame(puh2, columns=cols)

# Keep only the two features that get clustered.
newdf2 = newdf2.loc[:, ['Water_(g)', 'Sugar_Tot_(g)']].dropna()
def fit_kmeans(data, centers, random_state=None):
    """Cluster ``data`` with k-means and return one cluster label per row.

    Args:
        data: Feature table/array passed straight to ``KMeans.fit_predict``.
        centers: Number of clusters to fit.
        random_state: Optional seed forwarded to ``KMeans``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to get
            the same clustering on every run.

    Returns:
        The label array returned by ``KMeans.fit_predict``.
    """
    model = KMeans(n_clusters=centers, random_state=random_state)
    return model.fit_predict(data)
def _bar_grid_layout(num):
    """Return (rows, cols, figsize, x_label_size, y_label_size) for ``num`` bar charts, or None."""
    if num < 4:
        return num, 1, (14, 16), 30, 20
    if num == 4:
        return 2, 2, (24, 14), 30, 20
    if num < 7:
        return 3, 2, (24, 14), 24, 18
    if num < 10:
        return 3, 3, (34, 18), 24, 19
    # No bar-chart layout is defined for ten or more clusters.
    return None


def make_charts(num, tempfunc=None):
    """Cluster the foods into ``num`` groups and plot the result.

    Two figures are shown:

    1. a scatter plot of water against sugar, coloured by cluster;
    2. one horizontal bar chart per cluster with the share (in %) of its ten
       most frequent keywords. This figure is skipped when ``num`` is 10 or
       more, because no grid layout is defined for that many clusters.

    Args:
        num: Number of k-means clusters.
        tempfunc: Table holding at least the 'Keyword', 'Name', 'Water_(g)'
            and 'Sugar_Tot_(g)' columns. Defaults to a fresh copy of the
            module-level ``temp`` table, taken at call time.
    """
    if tempfunc is None:
        tempfunc = temp.copy()

    labels = fit_kmeans(newdf2, num)
    tempfunc = tempfunc.loc[:, ['Keyword', 'Name', 'Water_(g)', 'Sugar_Tot_(g)']].dropna()
    tempfunc['Label'] = labels
    # The palette is finite; wrap around instead of failing on higher labels.
    palette_size = len(colors)

    # Scatter plot of the clusters.
    plt.title('Food Dataset K-Means Clustering')
    plt.ylabel('Sugar(g)')
    plt.xlabel('Water(g)')
    for g in np.unique(labels):
        members = tempfunc.iloc[np.where(labels == g)[0]]
        plt.scatter(members['Water_(g)'], members['Sugar_Tot_(g)'],
                    c=colors[int(g) % palette_size],
                    label="Group " + str(g + 1),
                    alpha=0.3)
    plt.legend()
    plt.show()

    # Per-cluster keyword breakdown.
    layout = _bar_grid_layout(num)
    if layout is None:
        return
    rows, grid_cols, figsize, x_size, y_size = layout
    # squeeze=False always yields a 2-D array, so a 1x1 grid can be indexed too.
    axes = plt.subplots(rows, grid_cols, figsize=figsize, squeeze=False)[1].ravel()
    text_kwargs = dict(fontsize=26)
    for x in range(0, num):
        yo = tempfunc[tempfunc['Label'] == x].groupby('Keyword').count()
        counter = yo['Label'].sort_values().tail(10)
        counter = (counter / counter.sum()) * 100
        axes[x].barh(counter.index, counter, color=colors[x % palette_size])
        axes[x].set_title(f"Group {x + 1} in %", **text_kwargs)
        axes[x].tick_params(axis="x", labelsize=x_size)
        axes[x].tick_params(axis="y", labelsize=y_size)
    # Hide grid cells that have no cluster to show (e.g. 5 clusters on a 3x2 grid).
    for spare in axes[num:]:
        spare.set_visible(False)
    plt.show()
# Driver for make_charts over 2..8 clusters. It is currently switched off:
# the loop exits on its first pass, before the call would run.
for x in range(2, 9):
    # make_charts(x)
    break
def plot_score(tries: int):
    """Plot the k-means objective for 1 to ``tries - 1`` clusters.

    One model is fitted on the module-level ``newdf2`` table per cluster
    count, and the absolute value of ``KMeans.score`` is plotted against the
    count. ``score`` is the negated k-means objective, so the y axis is the
    sum of squared distances to the nearest centroid.

    Args:
        tries: Exclusive upper bound on the number of clusters to try.
    """
    centers = list(range(1, tries))
    scores = [abs(KMeans(n_clusters=k).fit(newdf2).score(newdf2)) for k in centers]

    plt.xticks(np.arange(0, tries, step=1))
    plt.plot(centers, scores)
    plt.title("scree Plot")
    plt.xlabel('Centers')
    plt.ylabel('Sum of Squared Distances to Centroid')
    plt.show()
#plot_score(20) | |
def exec2():
    """Show the three exercise-2 figures.

    1. A parallel-coordinates plot of the module-level ``temp2`` table, with
       lines coloured by quantile-transformed fibre content.
    2. A histogram of 'Water_(g)' with a fitted normal curve.
    3. A heatmap of the Pearson correlation between the columns of the
       module-level ``df`` table.

    Side effect: ``temp2['Keyword']`` is converted to a categorical column and
    a 'Codes' column holding its integer codes is added to ``temp2``.
    """
    from sklearn import preprocessing
    import plotly.figure_factory as ff

    # Quantile-transform fibre to the 0..1 range that cmin/cmax below expect.
    scaled = preprocessing.QuantileTransformer().fit_transform(temp2[['Fiber_TD_(g)']])
    scaled = pd.DataFrame(scaled, columns=['Fiber_TD_(g)'])

    # Parcoords needs numeric values, so keywords are shown via their codes.
    temp2['Keyword'] = temp2['Keyword'].astype('category')
    temp2['Codes'] = temp2['Keyword'].cat.codes
    unique_codes = temp2[['Codes', 'Keyword']].drop_duplicates()
    unique_codes = unique_codes.sort_values('Codes').set_index('Codes').to_dict()['Keyword']

    # (upper bound of the axis, column name) for every numeric axis, in display order.
    numeric_axes = [
        (100, 'Sugar_Tot_(g)'),
        (89, 'Protein_(g)'),
        (33954, 'Beta_Carot_(µg)'),
        (68466, 'Vit_A_(µg)'),
        (15, 'Copper_(mg)'),
        (781, 'Magnesium_(mg)'),
        (100, 'Lipid_Tot_(g)'),
        (84, 'FA_Mono_(g)'),
        (902, 'Energy_(kcal)'),
        (100, 'Water_(g)'),
    ]
    dimensions = [
        # Tick positions come from the codes themselves, so they always line
        # up with the tick labels whatever the number of keywords is.
        dict(tickvals=list(unique_codes.keys()), ticktext=list(unique_codes.values()),
             label='FoodType', values=temp2['Codes']),
    ]
    dimensions += [
        dict(range=[0, upper], label=column, values=temp2[column])
        for upper, column in numeric_axes
    ]

    fig = go.Figure(data=go.Parcoords(
        line=dict(color=scaled['Fiber_TD_(g)'],
                  colorscale='Viridis',
                  showscale=True,
                  cmin=0,
                  cmax=1,
                  colorbar=dict(title=dict(text="Fiber_TD_(g)",
                                           font=dict(size=16)),
                                tickmode="array",
                                tickvals=[0, 0.5, 1],
                                ticks='outside',
                                ticktext=['Almost None', 'Not a lot', 'A lot'],
                                tickfont=dict(size=14))),
        dimensions=dimensions,
    ))
    fig.show()

    # Exercise 2: distribution of the water content.
    # The column is picked by name so the data always matches its label.
    hist_data = [df['Water_(g)'].to_numpy()]
    group_labels = ['Water_(g)']
    # Create distplot with custom bin_size
    fig = ff.create_distplot(hist_data, group_labels, bin_size=.4, curve_type='normal')
    fig.update_layout(title_text='Normal Distribution Curve Plot')
    fig.show()

    # Correlation heatmap; corrcoef treats rows as variables, hence the transpose.
    corr_data = np.corrcoef(df.to_numpy().T)
    column_categ = df.columns
    fig = go.Figure(data=go.Heatmap(
        z=corr_data, x=column_categ, y=column_categ, colorscale='greys'
    ))
    fig.update_layout(title={
        'text': 'Pearson Product-Moment Correlation Coefficients Heatmap',
        'font': {'size': 56, 'color': '#110a54'},
        'xanchor': 'center',
        'yanchor': 'top',
        'y': 0.975,
        'x': 0.5
    })
    fig.show()
#exec2() | |
def _mean_profile(frame, key, wanted, categories):
    """Average ``categories`` per ``key`` value over the rows whose key is in ``wanted``."""
    subset = frame.loc[frame[key].isin(wanted), categories + [key]]
    return subset.groupby(key)[categories].agg('mean')


def _rescale(profile, divide=(), multiply=()):
    """Scale whole columns of ``profile`` in place by (column, factor) pairs."""
    for column, factor in divide:
        profile[column] = profile[column] / factor
    for column, factor in multiply:
        profile[column] = profile[column] * factor


def _show_radar(profile, names, theta):
    """Show one filled radar trace per row label in ``names``, using ``theta`` as axis labels."""
    fig = go.Figure()
    for name in names:
        fig.add_trace(go.Scatterpolar(
            r=profile.loc[name].values,
            theta=theta,
            fill='toself',
            name=name
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
            )),
        showlegend=True
    )
    fig.show()


def exec3():
    """Show the exercise-3 figures, built from a fresh read of the spreadsheet.

    1. A sunburst listing every food name under the DUCK and GOOSE keywords.
    2. A radar chart comparing the mean nutrient profile of DUCK and GOOSE.
    3. A radar chart comparing three raw avocado entries.
    4. A radar chart comparing four blueberry entries (the table is printed
       before and after rescaling).

    In the radar charts some columns are divided or multiplied by a constant
    so that all axes share a comparable range; the factor is spelled out in
    the axis label (for example 'Water_(g)/3').
    """
    from itertools import groupby
    from operator import itemgetter

    df = pd.read_excel('USDA_Food_Database.xlsx')

    # ---- Sunburst (first attempt at exercise 3) ----
    # (Keyword, Name) pairs sorted by keyword, then grouped into
    # (keyword, [names]) so each keyword carries its list of foods.
    indexed = df.set_index(['Keyword', 'Name'])
    keyword_name_pairs = list(indexed.sort_values('Keyword')["No"].to_dict().keys())
    names_by_keyword = [(k, [x for _, x in g])
                        for k, g in groupby(keyword_name_pairs, itemgetter(0))]

    picked_categories = ['GOOSE', 'DUCK']
    id_list = list(picked_categories)
    parents = ["", ""]  # the two picked keywords are the roots
    for keyword, names in names_by_keyword:
        if keyword in picked_categories:
            id_list += names
            # Every food points at its own keyword, whatever the sort order is.
            parents += [keyword] * len(names)

    fig = go.Figure(go.Sunburst(
        labels=id_list,
        parents=parents,
        marker=dict(colors=['#e5f5f9', '#99d8c9'])
    ))
    fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    fig.show()

    # ---- Radar: DUCK vs GOOSE ----
    categories = ['Water_(g)', 'Energy_(kcal)', 'Protein_(g)', 'Lipid_Tot_(g)', 'Thiamin_(mg)', 'Niacin_(mg)',
                  'Selenium_(µg)', 'Iron_(mg)']
    grouped = _mean_profile(df, 'Keyword', ['DUCK', 'GOOSE'], categories)
    # Column-wise assignment: writing through grouped.loc[row][col] may only
    # change a temporary copy and leave the table unscaled.
    _rescale(grouped,
             divide=[('Energy_(kcal)', 20), ('Water_(g)', 3)],
             multiply=[('Iron_(mg)', 2), ('Thiamin_(mg)', 50)])
    theta = ['Water_(g)/3', 'Energy_(kcal)/20', 'Protein_(g)', 'Lipid_Tot_(g)', 'Thiamin_(mg)*50', 'Niacin_(mg)',
             'Selenium_(µg)', 'Iron_(mg)*2']
    _show_radar(grouped, ['DUCK', 'GOOSE'], theta)

    # ---- Radar: avocados ----
    avocados = ['AVOCADOS,RAW,ALL COMM VAR', 'AVOCADOS,RAW,CALIFORNIA', 'AVOCADOS,RAW,FLORIDA']
    categories = ['Water_(g)', 'Energy_(kcal)', 'Lipid_Tot_(g)', 'Carbohydrt_(g)', 'Sugar_Tot_(g)', 'Phosphorus_(mg)',
                  'Potassium_(mg)', 'Vit_B6_(mg)', 'Thiamin_(mg)', 'Niacin_(mg)', 'Iron_(mg)', 'Vit_E_(mg)']
    categoriediv = [('Water_(g)', 5), ('Energy_(kcal)', 10), ('Phosphorus_(mg)', 5), ('Potassium_(mg)', 50)]
    categoriemul = [('Sugar_Tot_(g)', 4), ('Vit_B6_(mg)', 10), ('Thiamin_(mg)', 100), ('Iron_(mg)', 10)]
    df4 = _mean_profile(df, 'Name', avocados, categories)
    _rescale(df4, divide=categoriediv, multiply=categoriemul)
    theta = ['Water_(g)/5', 'Energy_(kcal)/10', 'Lipid_Tot_(g)', 'Carbohydrt_(g)', 'Sugar_Tot_(g)*4',
             'Phosphorus_(mg)/5', 'Potassium_(mg)/50', 'Vit_B6_(mg)*10', 'Thiamin_(mg)*100', 'Niacin_(mg)',
             'Iron_(mg)*10', 'Vit_E_(mg)']
    _show_radar(df4, avocados, theta)

    # ---- Radar: blueberries ----
    blueberries = ['BLUEBERRIES,FRZ,SWTND', 'BLUEBERRIES,FRZ,UNSWTND', 'BLUEBERRIES,RAW', 'BLUEBERRIES,WILD,FRZ']
    categories = ['Fiber_TD_(g)', 'Calcium_(mg)', 'Iron_(mg)',
                  'Phosphorus_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Manganese_(mg)', 'Vit_A_(µg)']
    categoriemul = [('Iron_(mg)', 10), ('Sodium_(mg)', 2), ('Zinc_(mg)', 20), ('Manganese_(mg)', 5)]
    categoriediv = [('Vit_A_(µg)', 5)]
    df5 = _mean_profile(df, 'Name', blueberries, categories)
    print(df5)
    _rescale(df5, divide=categoriediv, multiply=categoriemul)
    print(df5)
    categoriesren = ['Fiber_TD_(g)', 'Calcium_(mg)', 'Iron_(mg)*10',
                     'Phosphorus_(mg)', 'Sodium_(mg)*2', 'Zinc_(mg)*20', 'Manganese_(mg)*5', 'Vit_A_(µg)/5']
    _show_radar(df5, blueberries, categoriesren)
# Script entry point: only the exercise-3 figures are produced when the file runs.
exec3()
#'BLUEBERRIES,FRZ,SWTND', 'BLUEBERRIES,FRZ,UNSWTND', 'BLUEBERRIES,RAW', 'BLUEBERRIES,WILD,FRZ', |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment