Skip to content

Instantly share code, notes, and snippets.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Load the pub list and the population table from the two sheets of the
# same workbook. The 'name' column is passed through str while reading.
pubs = pd.read_excel('open_pubs.xlsx', sheet_name='open_pubs', converters={'name': str})
cities = pd.read_excel('open_pubs.xlsx', sheet_name='population')

# Lower-case the names.
pubs['name'] = pubs['name'].str.lower()

# Count the words in each name. Splitting on runs of whitespace (no explicit
# separator) keeps repeated, leading or trailing spaces from being counted as
# empty "words". A missing name yields NaN instead of being counted as the
# one-word string "nan", so it is left out of the histogram and the mean.
pubs['word_count'] = pubs['name'].str.split().str.len()
pubs.hist(column=['word_count'], grid=False, bins=10)

# An f-string avoids the doubled spaces that print() inserted between the
# comma-separated pieces of the original message.
average_word_count = pubs['word_count'].mean()
print(f'The average length of the Pub names is {average_word_count} words.')
# Build the word cloud, show it with matplotlib, then save it as a PNG.
# NOTE(review): `stopwords` and `text` are not assigned anywhere in the
# visible code -- confirm they are defined before this point.
_cloud_settings = {
    'max_words': 150,
    'background_color': "white",
    'stopwords': stopwords,
    'normalize_plurals': False,
}
wordcloud = WordCloud(**_cloud_settings).generate(text)

# Render on a high-resolution figure with the axes hidden.
plt.figure(dpi=500)
plt.imshow(wordcloud, interpolation="bicubic")
plt.axis("off")
plt.show()

wordcloud.to_file("pub_wordcloud.png")
# Load the lists: one sheet per category, read without a header row so the
# first column is labelled 0.
# The keyword is `sheet_name` (as in the other read_excel calls in this
# script); the old `sheetname` spelling is no longer accepted by pandas.
# Passing a list of sheet names parses the workbook once and returns a dict
# of DataFrames keyed by sheet name.
_list_sheets = pd.read_excel(
    'animals.xlsx',
    sheet_name=['animals', 'colors', 'royal', 'sports'],
    header=None,
)

# Join the first column of each sheet into a single "|"-separated string.
# NOTE(review): presumably these are used as regex alternations later on;
# str.join requires every cell in that column to be a string -- confirm.
animals = "|".join(_list_sheets['animals'][0])
colors = "|".join(_list_sheets['colors'][0])
royal = "|".join(_list_sheets['royal'][0])
sports = "|".join(_list_sheets['sports'][0])
# One map-scatter trace with a small marker per pub, labelled with its name.
# The original list literal was never closed; the closing bracket is added
# here so the statement parses.
# NOTE(review): `go` is not imported in the visible code -- presumably
# plotly's graph-objects module; confirm the import exists.
# NOTE(review): 'latitute' is kept exactly as written; verify it matches the
# column name in the spreadsheet.
data = [
    go.Scattermapbox(
        lat=pubs['latitute'],
        lon=pubs['longitude'],
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=1.5
        ),
        text=pubs['name'],
    ),
]
# Keep only the part of the zip code that precedes the first space.
postcode_prefix = pubs['zip_code'].str.split(' ').str[0]
pubs['Postcode'] = postcode_prefix

# Count pubs per Postcode and lay the counts out as a two-column frame.
pubs_by_postcode = (
    pubs['Postcode']
    .value_counts()
    .rename_axis('Postcode')
    .reset_index(name='Pubs')
)

# Attach coordinates, town/area and population for each Postcode; postcodes
# without a match in `cities` are kept (left join).
city_columns = ['Postcode', 'Latitude', 'Longitude', 'Town/Area', 'Population']
pubs_by_postcode = pubs_by_postcode.merge(
    cities[city_columns],
    how='left',
    on='Postcode',
)
# Import and create all models. Tune appropriate parameters for each model.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Decision tree limited to depth 4, with a fixed seed.
tree = DecisionTreeClassifier(
    max_depth=4,
    random_state=0,
)

# Logistic regression with C=10.
logreg = LogisticRegression(C=10)

# k-nearest neighbours using a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1)
# Train each model and record its score on the training and the test split.
# The loop body had lost its indentation in the original, which made the
# statement a syntax error; it is restored here.
# NOTE(review): `model_list`, `X_train`, `y_train`, `X_test` and `y_test`
# are not defined in the visible code -- confirm they exist before this runs.
train_scores = []
test_scores = []
for model in model_list:
    model.fit(X_train, y_train)
    train_scores.append(model.score(X_train, y_train))
    test_scores.append(model.score(X_test, y_test))
# Start of a DataFrame whose 'Model Name' column lists eight model labels.
# NOTE(review): this statement is cut off in the visible source -- the dict
# and the call are never closed, and only three of the eight named models
# are created in the visible code. Presumably the missing part adds the
# train/test score columns; restore it from the original before running.
scores_df= pd.DataFrame(
{'Model Name': ['tree','logreg','knn','naive_b','forest','gbrt','mlp','svc'],
# Left-join examples on 'Person ID'.
# NOTE(review): `table1` and `table2` are not defined in the visible code.

# Bring over a single column (City) from table2.
table1_city = pd.merge(
    table1,
    table2[['Person ID', 'City']],
    how='left',
    on='Person ID',
)

# Bring over every column of table2.
table1_all = pd.merge(table1, table2, how='left', on='Person ID')

# Same join, with missing values replaced by 'Unknown' in one expression.
table1_fill_nans = pd.merge(
    table1, table2, how='left', on='Person ID'
).fillna('Unknown')