Laura Lewis (L-Lewis)
@L-Lewis
L-Lewis / amenities-set.py
Created May 16, 2019 12:36
Creating a set of all possible amenities in the Airbnb dataset
# Combining all amenity strings into one long comma-separated string
amenities_list = list(df.amenities)
amenities_list_string = " ".join(amenities_list)
amenities_list_string = amenities_list_string.replace('{', '')
amenities_list_string = amenities_list_string.replace('}', ',')
amenities_list_string = amenities_list_string.replace('"', '')
# Splitting on commas and deduplicating to get the set of unique amenities
amenities_set = [x.strip() for x in amenities_list_string.split(',')]
amenities_set = set(amenities_set)
amenities_set
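A plausible next step, not shown in the preview, is one boolean feature per amenity; a minimal sketch, assuming df.amenities still holds the raw strings:

for amenity in amenities_set:
    if amenity:  # skip the empty string left over by trailing commas
        df[amenity] = df.amenities.str.contains(amenity, regex=False)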
@L-Lewis
L-Lewis / mapping-boroughs.py
Created May 16, 2019 13:21
Using geopandas to plot the number and median price of Airbnb listings in each London borough
import geopandas as gpd
import pandas as pd
# Importing the London borough boundary GeoJSON file as a dataframe in geopandas
map_df = gpd.read_file('data/neighbourhoods.geojson')
# Creating a dataframe of listing counts and median price by borough
borough_df = pd.DataFrame(df.groupby('borough').size())
borough_df.rename(columns={0: 'number_of_listings'}, inplace=True)
borough_df['median_price'] = df.groupby('borough').price.median().values
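The preview ends before the plotting step the description mentions; a minimal sketch of the choropleth, assuming the GeoJSON names its borough column 'neighbourhood' and that those values match the borough names used above:

import matplotlib.pyplot as plt
merged_df = map_df.set_index('neighbourhood').join(borough_df)
fig, ax = plt.subplots(figsize=(10, 6))
merged_df.plot(column='median_price', cmap='viridis', linewidth=0.8, edgecolor='0.8', legend=True, ax=ax)
ax.axis('off')
ax.set_title('Median Airbnb price per night by London borough')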
@L-Lewis
L-Lewis / infrequent-amenities.py
Created May 16, 2019 14:41
Removing features for infrequent Airbnb amenities
# Produces a list of amenity features where one category (True or False) contains fewer than 10% of listings
infrequent_amenities = []
for col in df.iloc[:, 41:].columns:
    # A boolean column is uninformative if it is almost always False (sum < 10% of rows)
    # or almost always True (sum > 90% of rows)
    if df[col].sum() < len(df) / 10 or df[col].sum() > len(df) * 0.9:
        infrequent_amenities.append(col)
print(infrequent_amenities)

# Dropping infrequent amenity features
df.drop(infrequent_amenities, axis=1, inplace=True)
@L-Lewis
L-Lewis / airbnb-xgboost.py
Last active May 17, 2019 10:24
Fitting and evaluating an XGBoost regression model for the Airbnb data
import xgboost as xgb
import time
# Fitting the model (timing how long it takes to run)
xgb_reg_start = time.time()
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train)
val_preds_xgb_reg = xgb_reg.predict(X_test)
xgb_reg_end = time.time()
# Printing the results
print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")
@L-Lewis
L-Lewis / three-layer-nn.py
Last active May 16, 2019 20:13
Building, compiling and visualising a three-layer neural network
from keras import models, layers, optimizers, regularizers
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG
# Building the model
nn2 = models.Sequential()
nn2.add(layers.Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
nn2.add(layers.Dense(256, activation='relu'))
nn2.add(layers.Dense(256, activation='relu'))
nn2.add(layers.Dense(1, activation='linear'))
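The preview ends before the compiling and visualising steps the description mentions; a minimal sketch using the imports above (the optimiser, loss and metric are assumptions):

nn2.compile(optimizer=optimizers.Adam(), loss='mse', metrics=['mae'])
# Rendering the model graph inline in a notebook
SVG(model_to_dot(nn2, show_shapes=True).create(prog='dot', format='svg'))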
@L-Lewis
L-Lewis / nn-evaluation.py
Created May 16, 2019 20:15
Function for evaluating a neural network for regression
def nn_model_evaluation(model, skip_epochs=0, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    """
    For a given neural network model that has already been fit, prints the MSE and r squared values for the
    train and test sets, a line graph of the loss in each epoch, and a scatterplot of predicted vs. actual
    values with a line representing where predicted = actual. Optionally, skip_epochs can be provided to skip
    that number of epochs in the line graph of losses (useful where the loss in the first epoch is orders of
    magnitude larger than in subsequent epochs). Training and test sets can also optionally be specified.
    """
    # MSE and r squared values
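    # The gist preview cuts off here; a minimal sketch of how the body described in the
    # docstring might continue (assumes matplotlib.pyplot is imported as plt and the model
    # was fit with validation data, so model.history holds 'loss' and 'val_loss'):
    from sklearn.metrics import mean_squared_error, r2_score
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    print(f"Train MSE: {mean_squared_error(y_train, train_preds):.4f}, "
          f"r squared: {r2_score(y_train, train_preds):.4f}")
    print(f"Test MSE: {mean_squared_error(y_test, test_preds):.4f}, "
          f"r squared: {r2_score(y_test, test_preds):.4f}")
    # Line graph of the loss in each epoch, skipping the first skip_epochs values
    plt.plot(model.history.history['loss'][skip_epochs:], label='train loss')
    plt.plot(model.history.history['val_loss'][skip_epochs:], label='validation loss')
    plt.legend()
    plt.show()
    # Scatterplot of predicted vs. actual values with a predicted = actual line
    plt.scatter(y_test, test_preds, alpha=0.3)
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()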

Building the first iteration of a convolutional neural network

from keras import models, layers
# Building the first model iteration
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(32, (4, 4), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
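# The fragment stops at the last pooling layer; a plausible classification head for the
# binary task suggested by class_mode='binary' below (layer size and optimiser are assumptions):
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])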

Loading and rescaling images with Keras' ImageDataGenerator

from keras.preprocessing.image import ImageDataGenerator
import pandas as pd
# Getting the images and rescaling
image_folder = 'model3_images/'
image_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    image_folder, shuffle=False, class_mode='binary',
    target_size=(128, 128), batch_size=20000)
images, labels = next(image_generator)
# Output: Found 20000 images belonging to 2 classes.

Pre-processing structured data with MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

def process_structured_data(df, train, test):
    """
    Pre-processes the given dataframe by min-max scaling the continuous features
    (fit-transforming the training data and transforming the test data)
    """
    continuous = ["population_per_hectare", "bicycle_aadf", "motor_vehicle_aadf"]
    cs = MinMaxScaler()
    trainX = cs.fit_transform(train[continuous])
    testX = cs.transform(test[continuous])
    return trainX, testX
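A usage sketch, assuming train and test are row subsets of df produced by an earlier split:

trainX, testX = process_structured_data(df, train, test)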

Creating a simple two-layer MLP with Keras

from keras.models import Sequential
from keras.layers import Dense

def create_mlp(dim, regularizer=None):
    """Creates a simple two-layer MLP with inputs of the given dimension"""
    model = Sequential()
    model.add(Dense(8, input_dim=dim, activation="relu", kernel_regularizer=regularizer))
    model.add(Dense(4, activation="relu", kernel_regularizer=regularizer))
    return model
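A usage sketch (the input dimension is an assumption); note the returned network has no output layer, so a final head would still need to be added before training:

mlp = create_mlp(trainX.shape[1])
mlp.summary()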