Skip to content

Instantly share code, notes, and snippets.

@TheBubblePopped
Created November 19, 2019 21:58
Show Gist options
  • Save TheBubblePopped/9b6a75ea34bc20668a1e5d53103e755c to your computer and use it in GitHub Desktop.
Save TheBubblePopped/9b6a75ea34bc20668a1e5d53103e755c to your computer and use it in GitHub Desktop.
visdat
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import pandas as pd
from sklearn import preprocessing as p
import plotly.graph_objects as go
# Widen pandas' console output so wide nutrient tables print without truncation.
pd.set_option('display.max_columns', 80)
pd.set_option('display.max_rows', 300)
pd.set_option('display.width', 1000)
from matplotlib import colors  # NOTE: this name is rebound to the palette dict on the next statement
# Palette used by the clustering charts: cluster index -> hex colour.
colors = {0:'#1b9e77', 1:'#d95f02', 2:'#7570b3', 3:'#e7298a', 4:'#66a61e', 5:'#e6ab02', 6:'#a6761d', 7:'#666666'}

# Drop incomplete rows ONCE, before splitting into the labelled and numeric
# views. Dropping them separately (all columns for `temp`, numeric columns only
# for `df`) can leave the two frames with different row counts, which breaks the
# positional pairing of cluster labels with `temp` rows in make_charts.
df = pd.read_excel('USDA_Food_Database.xlsx').dropna()
temp = df.copy()   # keeps the 'No', 'Name' and 'Keyword' columns for plotting
temp2 = df.copy()  # separate copy; exec2 adds columns to it
df = df.drop(columns=['No', 'Name', 'Keyword'])
cols = df.columns

# MinMaxScaler returns a bare ndarray, so the column names are re-attached.
puh2 = p.MinMaxScaler().fit_transform(df)
newdf2 = pd.DataFrame(puh2, columns=cols)
# Only these two scaled features are clustered.
newdf2 = newdf2.loc[:, ['Water_(g)', 'Sugar_Tot_(g)']]
def fit_kmeans(data, centers):
    """Fit a ``KMeans`` model on *data* and return the label of each sample.

    ``centers`` is passed to ``KMeans`` as its first positional argument.
    """
    model = KMeans(centers)
    return model.fit_predict(data)
def _chart_grid(num):
    """Return ``(rows, cols, figsize, x_tick_size, y_tick_size)`` for *num* bar charts.

    Returns ``None`` when *num* is 10 or more, for which no bar-chart layout
    is defined.
    """
    if num < 4:
        return num, 1, (14, 16), 30, 20
    if num == 4:
        return 2, 2, (24, 14), 30, 20
    if num < 7:
        return 3, 2, (24, 14), 24, 18
    if num < 10:
        return 3, 3, (34, 18), 24, 19
    return None


def make_charts(num, tempfunc=temp.copy()):
    """Cluster the scaled water/sugar features into *num* groups and plot them.

    Two figures are shown:

    1. a scatter plot of ``Water_(g)`` against ``Sugar_Tot_(g)`` coloured by
       cluster, and
    2. one horizontal bar chart per cluster with the percentage share of its
       ten most frequent ``Keyword`` values (only when ``num`` is below 10).

    Args:
        num: number of clusters handed to ``fit_kmeans``.
        tempfunc: frame holding at least the ``Keyword``, ``Name``,
            ``Water_(g)`` and ``Sugar_Tot_(g)`` columns. Its rows must line up
            positionally with ``newdf2``, because labels are attached by
            position. The default is evaluated once at definition time; it is
            never modified here because the name is rebound to a new frame.
    """
    labels = fit_kmeans(newdf2, num)
    tempfunc = tempfunc.loc[:, ['Keyword', 'Name', 'Water_(g)', 'Sugar_Tot_(g)']].dropna()
    tempfunc['Label'] = labels
    palette_size = len(colors)

    # Scatter plot of the raw (unscaled) features, one colour per cluster.
    plt.title('Food Dataset K-Means Clustering')
    plt.ylabel('Sugar(g)')
    plt.xlabel('Water(g)')
    for g in np.unique(labels):
        in_group = labels == g
        plt.scatter(tempfunc.loc[in_group, 'Water_(g)'],
                    tempfunc.loc[in_group, 'Sugar_Tot_(g)'],
                    # Cycle through the palette so more clusters than colours cannot raise KeyError.
                    c=colors[int(g) % palette_size],
                    label="Group " + str(g + 1),
                    alpha=0.3)
    plt.legend()
    plt.show()

    # Bar charts showing which food keywords dominate each cluster.
    layout = _chart_grid(num)
    if layout is None:
        return
    n_rows, n_cols, figsize, x_tick_size, y_tick_size = layout
    # squeeze=False always yields a 2-D array of axes, so a single chart
    # (num == 1) can be indexed the same way as a grid.
    axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)[1].ravel()
    for x in range(num):
        per_keyword = tempfunc[tempfunc['Label'] == x].groupby('Keyword')['Label'].count()
        top = per_keyword.sort_values().tail(10)
        share = (top / top.sum()) * 100
        ax = axes[x]
        ax.barh(share.index, share, color=colors[x % palette_size])
        ax.set_title(f"Group {x + 1} in %", fontsize=26)
        ax.tick_params(axis="x", labelsize=x_tick_size)
        ax.tick_params(axis="y", labelsize=y_tick_size)
    # Hide grid cells that have no cluster to show (e.g. the 6th cell for num == 5).
    for spare in axes[num:]:
        spare.set_visible(False)
    plt.show()
# Disabled driver: the loop exits on its first iteration, so no charts are drawn.
# The commented-out call would run make_charts for 2 through 8 clusters.
for x in range(2,9):
    break#make_charts(x)
def plot_score(tries: int):
    """Plot the k-means objective on ``newdf2`` for every k in ``range(1, tries)``.

    One ``KMeans`` model is fitted per cluster count and the absolute value of
    its ``score`` on the training data is drawn against the cluster count.

    Args:
        tries: exclusive upper bound on the number of clusters to try.
    """
    centers = list(range(1, tries))
    # KMeans.score is documented as the negated k-means objective (the sum of
    # squared distances to the closest centroid); abs() restores its magnitude.
    scores = [abs(KMeans(center).fit(newdf2).score(newdf2)) for center in centers]
    plt.xticks(np.arange(0, tries, step=1))
    plt.plot(centers, scores)
    plt.title("scree Plot")
    plt.xlabel('Centers')
    # The plotted quantity is a sum of squared distances, not an average distance.
    plt.ylabel('Sum of Squared Distances From Centroid')
    plt.show()
#plot_score(20)
def exec2():
    """Show the exercise 2 figures: parallel coordinates, distribution plot, correlation heatmap.

    Reads the module-level frames ``temp2`` and ``df``. As a side effect
    ``temp2['Keyword']`` is converted to a categorical column and a ``Codes``
    column holding its integer category codes is added to ``temp2``.
    """
    from sklearn import preprocessing
    import plotly.figure_factory as ff

    # Line colour: quantile-transformed fibre content.
    scaled = preprocessing.QuantileTransformer().fit_transform(temp2[['Fiber_TD_(g)']])
    scaled = pd.DataFrame(scaled, columns=['Fiber_TD_(g)'])

    # Encode the food keyword as integers so it can be used as an axis.
    temp2['Keyword'] = temp2['Keyword'].astype('category')
    temp2['Codes'] = temp2['Keyword'].cat.codes
    unique_codes = temp2[['Codes', 'Keyword']].drop_duplicates()
    unique_codes = unique_codes.sort_values('Codes').set_index('Codes').to_dict()['Keyword']

    # (column, upper bound of the axis range) for every numeric axis, in display order.
    numeric_axes = [
        ('Sugar_Tot_(g)', 100),
        ('Protein_(g)', 89),
        ('Beta_Carot_(µg)', 33954),
        ('Vit_A_(µg)', 68466),
        ('Copper_(mg)', 15),
        ('Magnesium_(mg)', 781),
        ('Lipid_Tot_(g)', 100),
        ('FA_Mono_(g)', 84),
        ('Energy_(kcal)', 902),
        ('Water_(g)', 100),
    ]
    dimensions = [
        # Tick positions come from the codes actually present, so they always
        # pair up with the tick texts regardless of how many keywords exist.
        dict(tickvals=[int(code) for code in unique_codes],
             ticktext=list(unique_codes.values()),
             label='FoodType', values=temp2['Codes']),
    ]
    dimensions += [
        dict(range=[0, upper], label=column, values=temp2[column])
        for column, upper in numeric_axes
    ]

    fig = go.Figure(data=go.Parcoords(
        line=dict(color=scaled['Fiber_TD_(g)'],
                  colorscale='Viridis',
                  showscale=True,
                  cmin=0,
                  cmax=1,
                  colorbar=dict(title=dict(text="Fiber_TD_(g)",
                                           font=dict(size=16)),
                                tickmode="array",
                                tickvals=[0, 0.5, 1],
                                ticks='outside',
                                ticktext=['Almost None', 'Not a lot', 'A lot'],
                                tickfont=dict(size=14))),
        dimensions=dimensions,
    ))
    fig.show()

    # Exercise 2: distribution of the water content with a fitted normal curve.
    numpied_df = df.to_numpy()
    # Select the column by name so the data always matches its 'Water_(g)' label.
    hist_data = [df['Water_(g)'].to_numpy()]
    group_labels = ['Water_(g)']
    # Create distplot with custom bin_size
    fig = ff.create_distplot(hist_data, group_labels, bin_size=.4, curve_type='normal')
    fig.update_layout(title_text='Normal Distribution Curve Plot')
    fig.show()

    # Correlation matrix between all numeric columns (one variable per row after the transpose).
    corr_data = np.corrcoef(numpied_df.T)
    column_categ = df.columns
    fig = go.Figure(data=go.Heatmap(
        z=corr_data, x=column_categ, y=column_categ, colorscale='greys'
    ))
    fig.update_layout(title={
        'text': 'Pearson Product-Moment Correlation Coefficients Heatmap',
        'font': {'size': 56, 'color': '#110a54'},
        'xanchor': 'center',
        'yanchor': 'top',
        'y': 0.975,
        'x': 0.5
    })
    fig.show()
#exec2()
def _scale_columns(frame, divide=(), multiply=()):
    """Rescale whole columns of *frame* in place and return it.

    ``divide`` and ``multiply`` are iterables of ``(column, amount)`` pairs.
    Whole-column assignment is used because chained indexing such as
    ``frame.loc[row][column] = value`` may write to a temporary object and
    leave *frame* unchanged.
    """
    for column, divisor in divide:
        frame[column] = frame[column] / divisor
    for column, factor in multiply:
        frame[column] = frame[column] * factor
    return frame


def _radar_figure(frame, rows, theta):
    """Build a filled polar chart with one trace per index label in *rows*."""
    fig = go.Figure()
    for row in rows:
        fig.add_trace(go.Scatterpolar(
            r=frame.loc[row].values,
            theta=theta,
            fill='toself',
            name=row,
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
            )),
        showlegend=True,
    )
    return fig


def exec3():
    """Show the exercise 3 figures for ducks/geese, avocados and blueberries.

    Re-reads ``USDA_Food_Database.xlsx`` into a local frame and shows, in order:
    a sunburst of the DUCK and GOOSE product names, a radar chart of the mean
    nutrient values per keyword (DUCK vs GOOSE), a radar chart of three raw
    avocado entries and a radar chart of four blueberry entries. Columns are
    rescaled before plotting so they share a comparable range; the factor is
    spelled out in each axis label. The blueberry table is printed before and
    after rescaling.
    """
    from itertools import groupby
    from operator import itemgetter

    df = pd.read_excel('USDA_Food_Database.xlsx')

    # --- Sunburst (marked as a failed attempt by the author) ---
    df3 = df.set_index(['Keyword', 'Name'])
    # (Keyword, Name) pairs ordered by keyword, grouped into
    # [(keyword, [name, name, ...]), ...].
    index_pairs = list(df3.sort_values('Keyword')["No"].to_dict().keys())
    names_by_keyword = [(k, [x for _, x in g]) for k, g in groupby(index_pairs, itemgetter(0))]

    picked_categories = ['GOOSE', 'DUCK']
    labels = list(picked_categories)
    parents = [""] * len(picked_categories)  # the two roots have no parent
    for keyword, names in names_by_keyword:
        if keyword in picked_categories:
            labels += names
            # Take each name's parent from its own group instead of inferring
            # it from the position at which the first group ends.
            parents += [keyword] * len(names)
    fig = go.Figure(go.Sunburst(
        labels=labels,
        parents=parents,
        marker=dict(colors=['#e5f5f9', '#99d8c9'])
    ))
    fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    fig.show()

    # --- Radar chart: DUCK vs GOOSE, mean per keyword ---
    categories = ['Water_(g)', 'Energy_(kcal)', 'Protein_(g)', 'Lipid_Tot_(g)', 'Thiamin_(mg)', 'Niacin_(mg)',
                  'Selenium_(µg)', 'Iron_(mg)']
    df3 = df.loc[:, categories + ['Keyword', 'Name']].copy()
    df3 = df3[df.Keyword.isin(['DUCK', 'GOOSE'])]
    # Other aggregations accepted by agg(): min max mean median sum count std var
    # size describe nunique idxmin idxmax
    grouped = df3.groupby('Keyword')[categories].agg('mean')
    _scale_columns(
        grouped,
        divide=[('Energy_(kcal)', 20), ('Water_(g)', 3)],
        multiply=[('Iron_(mg)', 2), ('Thiamin_(mg)', 50)],
    )
    categories = ['Water_(g)/3', 'Energy_(kcal)/20', 'Protein_(g)', 'Lipid_Tot_(g)', 'Thiamin_(mg)*50', 'Niacin_(mg)',
                  'Selenium_(µg)', 'Iron_(mg)*2']
    _radar_figure(grouped, ['DUCK', 'GOOSE'], categories).show()

    # --- Radar chart: avocados ---
    avocado_names = ['AVOCADOS,RAW,ALL COMM VAR', 'AVOCADOS,RAW,CALIFORNIA', 'AVOCADOS,RAW,FLORIDA']
    categories = ['Water_(g)', 'Energy_(kcal)', 'Lipid_Tot_(g)', 'Carbohydrt_(g)', 'Sugar_Tot_(g)', 'Phosphorus_(mg)',
                  'Potassium_(mg)', 'Vit_B6_(mg)', 'Thiamin_(mg)', 'Niacin_(mg)', 'Iron_(mg)', 'Vit_E_(mg)']
    categoriediv = [('Water_(g)', 5), ('Energy_(kcal)', 10), ('Phosphorus_(mg)', 5), ('Potassium_(mg)', 50)]
    categoriemul = [('Sugar_Tot_(g)', 4), ('Vit_B6_(mg)', 10), ('Thiamin_(mg)', 100), ('Iron_(mg)', 10)]
    df4 = df.loc[:, categories + ['Keyword', 'Name']].copy()
    df4 = df4[df.Name.isin(avocado_names)]
    df4 = df4.groupby('Name')[categories].agg('mean')
    categories = ['Water_(g)/5', 'Energy_(kcal)/10', 'Lipid_Tot_(g)', 'Carbohydrt_(g)', 'Sugar_Tot_(g)*4',
                  'Phosphorus_(mg)/5', 'Potassium_(mg)/50', 'Vit_B6_(mg)*10', 'Thiamin_(mg)*100', 'Niacin_(mg)',
                  'Iron_(mg)*10', 'Vit_E_(mg)']
    _scale_columns(df4, divide=categoriediv, multiply=categoriemul)
    _radar_figure(df4, avocado_names, categories).show()

    # --- Radar chart: blueberries ---
    blueberry_names = ['BLUEBERRIES,FRZ,SWTND', 'BLUEBERRIES,FRZ,UNSWTND', 'BLUEBERRIES,RAW',
                       'BLUEBERRIES,WILD,FRZ']
    categories = ['Fiber_TD_(g)', 'Calcium_(mg)', 'Iron_(mg)',
                  'Phosphorus_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Manganese_(mg)', 'Vit_A_(µg)']
    categoriemul = [('Iron_(mg)', 10), ('Sodium_(mg)', 2), ('Zinc_(mg)', 20), ('Manganese_(mg)', 5)]
    categoriediv = [('Vit_A_(µg)', 5)]
    df5 = df.loc[:, categories + ['Name']].copy()
    df5 = df5[df.Name.isin(blueberry_names)]
    df5 = df5.groupby('Name')[categories].agg('mean')
    print(df5)
    categoriesren = ['Fiber_TD_(g)', 'Calcium_(mg)', 'Iron_(mg)*10',
                     'Phosphorus_(mg)', 'Sodium_(mg)*2', 'Zinc_(mg)*20', 'Manganese_(mg)*5', 'Vit_A_(µg)/5']
    _scale_columns(df5, divide=categoriediv, multiply=categoriemul)
    print(df5)
    _radar_figure(df5, blueberry_names, categoriesren).show()
# Script entry point: only the exercise 3 figures are produced when the file runs.
exec3()
# Blueberry row names plotted by exec3:
# 'BLUEBERRIES,FRZ,SWTND', 'BLUEBERRIES,FRZ,UNSWTND', 'BLUEBERRIES,RAW', 'BLUEBERRIES,WILD,FRZ'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment