Laura Lewis (L-Lewis)
@L-Lewis
L-Lewis / amenities-set.py
Created May 16, 2019 12:36
Creating a set of all possible amenities in the Airbnb dataset
# Combining all amenity strings into one long comma-separated string
amenities_list = list(df.amenities)
amenities_list_string = " ".join(amenities_list)
amenities_list_string = amenities_list_string.replace('{', '')
amenities_list_string = amenities_list_string.replace('}', ',')
amenities_list_string = amenities_list_string.replace('"', '')
# Splitting on commas and deduplicating to get the set of unique amenities
amenities_set = [x.strip() for x in amenities_list_string.split(',')]
amenities_set = set(amenities_set)
amenities_set
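A plausible next step, not shown in the preview, is one boolean feature per amenity; a minimal sketch, assuming df.amenities still holds the raw strings:

for amenity in amenities_set:
    if amenity:  # skip the empty string left over by trailing commas
        df[amenity] = df.amenities.str.contains(amenity, regex=False)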
@L-Lewis
L-Lewis / mapping-boroughs.py
Created May 16, 2019 13:21
Using geopandas to plot the number and median price of Airbnb listings in each London borough
import geopandas as gpd
import pandas as pd
# Importing the London borough boundary GeoJSON file as a dataframe in geopandas
map_df = gpd.read_file('data/neighbourhoods.geojson')
# Creating a dataframe of listing counts and median price by borough
borough_df = pd.DataFrame(df.groupby('borough').size())
borough_df.rename(columns={0: 'number_of_listings'}, inplace=True)
borough_df['median_price'] = df.groupby('borough').price.median().values
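The preview ends before the plotting step the description mentions; a minimal sketch of the choropleth, assuming the GeoJSON names its borough column 'neighbourhood' and that those values match the borough names used above:

import matplotlib.pyplot as plt
merged_df = map_df.set_index('neighbourhood').join(borough_df)
fig, ax = plt.subplots(figsize=(10, 6))
merged_df.plot(column='median_price', cmap='viridis', linewidth=0.8, edgecolor='0.8', legend=True, ax=ax)
ax.axis('off')
ax.set_title('Median Airbnb price per night by London borough')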
@L-Lewis
L-Lewis / infrequent-amenities.py
Created May 16, 2019 14:41
Removing features for infrequent Airbnb amenities
# Produces a list of amenity features where one category (True or False) contains fewer than 10% of listings
infrequent_amenities = []
for col in df.iloc[:, 41:].columns:
    # A boolean column is uninformative if it is almost always False (sum < 10% of rows)
    # or almost always True (sum > 90% of rows)
    if df[col].sum() < len(df) / 10 or df[col].sum() > len(df) * 0.9:
        infrequent_amenities.append(col)
print(infrequent_amenities)

# Dropping infrequent amenity features
df.drop(infrequent_amenities, axis=1, inplace=True)
@L-Lewis
L-Lewis / airbnb-xgboost.py
Last active May 17, 2019 10:24
Fitting and evaluating an XGBoost regression model for the Airbnb data
import xgboost as xgb
import time
# Fitting the model (timing how long it takes to run)
xgb_reg_start = time.time()
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train)
val_preds_xgb_reg = xgb_reg.predict(X_test)
xgb_reg_end = time.time()
# Printing the results
print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")
@L-Lewis
L-Lewis / three-layer-nn.py
Last active May 16, 2019 20:13
Building, compiling and visualising a three-layer neural network
from keras import models, layers, optimizers, regularizers
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG
# Building the model
nn2 = models.Sequential()
nn2.add(layers.Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
nn2.add(layers.Dense(256, activation='relu'))
nn2.add(layers.Dense(256, activation='relu'))
nn2.add(layers.Dense(1, activation='linear'))
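The preview ends before the compiling and visualising steps the description mentions; a minimal sketch using the imports above (the optimiser, loss and metric are assumptions):

nn2.compile(optimizer=optimizers.Adam(), loss='mse', metrics=['mae'])
# Rendering the model graph inline in a notebook
SVG(model_to_dot(nn2, show_shapes=True).create(prog='dot', format='svg'))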
@L-Lewis
L-Lewis / nn-evaluation.py
Created May 16, 2019 20:15
Function for evaluating a neural network for regression
def nn_model_evaluation(model, skip_epochs=0, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    """
    For a given neural network model that has already been fit, prints the MSE and r squared values for the
    train and test sets, a line graph of the loss in each epoch, and a scatterplot of predicted vs. actual
    values with a line representing where predicted = actual. Optionally, skip_epochs can be provided to skip
    that number of epochs in the line graph of losses (useful where the loss in the first epoch is orders of
    magnitude larger than in subsequent epochs). Training and test sets can also optionally be specified.
    """
    # MSE and r squared values
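    # The gist preview cuts off here; a minimal sketch of how the body described in the
    # docstring might continue (assumes matplotlib.pyplot is imported as plt and the model
    # was fit with validation data, so model.history holds 'loss' and 'val_loss'):
    from sklearn.metrics import mean_squared_error, r2_score
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    print(f"Train MSE: {mean_squared_error(y_train, train_preds):.4f}, "
          f"r squared: {r2_score(y_train, train_preds):.4f}")
    print(f"Test MSE: {mean_squared_error(y_test, test_preds):.4f}, "
          f"r squared: {r2_score(y_test, test_preds):.4f}")
    # Line graph of the loss in each epoch, skipping the first skip_epochs values
    plt.plot(model.history.history['loss'][skip_epochs:], label='train loss')
    plt.plot(model.history.history['val_loss'][skip_epochs:], label='validation loss')
    plt.legend()
    plt.show()
    # Scatterplot of predicted vs. actual values with a predicted = actual line
    plt.scatter(y_test, test_preds, alpha=0.3)
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()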

Building the first iteration of a convolutional neural network

from keras import models, layers
# Building the first model iteration
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(32, (4, 4), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
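# The fragment stops at the last pooling layer; a plausible classification head for the
# binary task suggested by class_mode='binary' below (layer size and optimiser are assumptions):
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])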

Loading and rescaling images with Keras' ImageDataGenerator

from keras.preprocessing.image import ImageDataGenerator
import pandas as pd
# Getting the images and rescaling
image_folder = 'model3_images/'
image_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    image_folder, shuffle=False, class_mode='binary',
    target_size=(128, 128), batch_size=20000)
images, labels = next(image_generator)
# Output: Found 20000 images belonging to 2 classes.

Pre-processing structured data with MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

def process_structured_data(df, train, test):
    """
    Pre-processes the given dataframe by min-max scaling the continuous features
    (fit-transforming the training data and transforming the test data)
    """
    continuous = ["population_per_hectare", "bicycle_aadf", "motor_vehicle_aadf"]
    cs = MinMaxScaler()
    trainX = cs.fit_transform(train[continuous])
    testX = cs.transform(test[continuous])
    return trainX, testX
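A usage sketch, assuming train and test are row subsets of df produced by an earlier split:

trainX, testX = process_structured_data(df, train, test)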

Creating a simple two-layer MLP with Keras

from keras.models import Sequential
from keras.layers import Dense

def create_mlp(dim, regularizer=None):
    """Creates a simple two-layer MLP with inputs of the given dimension"""
    model = Sequential()
    model.add(Dense(8, input_dim=dim, activation="relu", kernel_regularizer=regularizer))
    model.add(Dense(4, activation="relu", kernel_regularizer=regularizer))
    return model
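A usage sketch (the input dimension is an assumption); note the returned network has no output layer, so a final head would still need to be added before training:

mlp = create_mlp(trainX.shape[1])
mlp.summary()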