This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator | |
# Load both sheets through a single ExcelFile handle so the workbook is opened
# once instead of once per read_excel call.
with pd.ExcelFile('open_pubs.xlsx') as _pubs_book:
    # The converter passes every cell of the 'name' column through str().
    pubs = pd.read_excel(_pubs_book, sheet_name='open_pubs', converters={'name': str})
    cities = pd.read_excel(_pubs_book, sheet_name='population')

# Normalise the pub names to lower case.
pubs['name'] = pubs['name'].str.lower()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count the words in each pub name.
# Splitting without a separator argument splits on runs of whitespace, so
# doubled, leading or trailing spaces no longer count as extra empty "words"
# (which is what split(" ") produced).
pubs['word_count'] = pubs['name'].astype(str).str.split().str.len()

# Distribution of name lengths, in words.
pubs.hist(column=['word_count'], grid=False, bins=10)

# Build the message as one string: passing several arguments to print() put a
# second space on each side of the number, on top of the spaces already
# present inside the literals.
print('The average length of the Pub names is {} words.'.format(pubs['word_count'].mean()))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the word cloud from `text`, skipping the words in `stopwords`
# (both are defined outside this snippet).
wordcloud = WordCloud(
    max_words=150,
    background_color="white",
    stopwords=stopwords,
    normalize_plurals=False,
).generate(text)

# Render the cloud on a high-dpi figure with the axes hidden.
plt.figure(dpi=500)
plt.imshow(wordcloud, interpolation="bicubic")
plt.axis("off")
plt.show()

# Also write the image to disk.
wordcloud.to_file("pub_wordcloud.png")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#load lists
def _load_pattern(workbook, sheet):
    """Return the first column of ``sheet`` joined into one '|'-separated string.

    Args:
        workbook: Anything ``pd.read_excel`` accepts as its first argument
            (a path or an open ``pd.ExcelFile``).
        sheet: Name of the sheet to read; it is read without a header row.

    Returns:
        A single string holding the cells of column 0 separated by '|'.
    """
    # `sheet_name` is the keyword the pubs workbook is already loaded with;
    # the older `sheetname` spelling is no longer accepted by current pandas.
    terms = pd.read_excel(workbook, sheet_name=sheet, header=None)[0]
    # str.join only accepts strings: drop blank cells (NaN) and convert any
    # numeric cells before joining.
    return "|".join(terms.dropna().astype(str))


# Open the workbook once and read the four term lists from it.
with pd.ExcelFile('animals.xlsx') as _lists_book:
    animals = _load_pattern(_lists_book, 'animals')
    colors = _load_pattern(_lists_book, 'colors')
    royal = _load_pattern(_lists_book, 'royal')
    sports = _load_pattern(_lists_book, 'sports')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data = [ | |
go.Scattermapbox( | |
lat=pubs['latitute'], | |
lon=pubs['longitude'], | |
mode='markers', | |
marker=go.scattermapbox.Marker( | |
size=1.5 | |
), | |
text=pubs['name'], | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Regional postcode: the part of 'zip_code' that precedes the first space.
pubs['Postcode'] = pubs['zip_code'].str.split(' ').str.get(0)

# Number of pubs per regional postcode, as a two-column frame (Postcode, Pubs).
pubs_by_postcode = (
    pubs['Postcode']
    .value_counts()
    .rename_axis('Postcode')
    .reset_index(name='Pubs')
)

# Attach coordinates, town/area and population from `cities`; postcodes with
# no match keep their row (left join). The pubs-per-1,000-people ratio itself
# is not computed in this snippet.
pubs_by_postcode = pubs_by_postcode.merge(
    cities[['Postcode', 'Latitude', 'Longitude', 'Town/Area', 'Population']],
    how='left',
    on='Postcode',
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the classifier classes, then create each model with the parameters
# chosen for it.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Decision tree capped at depth 4, with a fixed random seed.
tree = DecisionTreeClassifier(random_state=0, max_depth=4)
# Logistic regression with C=10.
logreg = LogisticRegression(C=10)
# Nearest-neighbour classifier that looks at a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit every model in `model_list` and record its score on the training and
# test splits, in list order.
train_scores, test_scores = [], []
for model in model_list:
    model.fit(X_train, y_train)
    train_scores += [model.score(X_train, y_train)]
    test_scores += [model.score(X_test, y_test)]
scores_df= pd.DataFrame( | |
{'Model Name': ['tree','logreg','knn','naive_b','forest','gbrt','mlp','svc'], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Left-join only the 'City' column of table2 onto table1, matching on 'Person ID'.
table1_city = pd.merge(table1, table2[['Person ID', 'City']], on='Person ID', how='left')

# Left-join every column of table2 onto table1, matching on 'Person ID'.
table1_all = pd.merge(table1, table2, on='Person ID', how='left')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Single expression: left-join table2 onto table1, then replace every missing
# value in the result with 'Unknown'.
table1_fill_nans = pd.merge(table1, table2, on='Person ID', how='left').fillna(value='Unknown')
OlderNewer