Skip to content

Instantly share code, notes, and snippets.

@vb100
Created June 27, 2018 13:22
Show Gist options
  • Save vb100/10bfabacc78920480ec8ebea15db2062 to your computer and use it in GitHub Desktop.
Save vb100/10bfabacc78920480ec8ebea15db2062 to your computer and use it in GitHub Desktop.
This is one of the biggest Machine learning project 100 % made by me. This module read periodically updated Training set, analyze it by performing Hyperparameter Tuning for Decision Tree/Random Forest and set the best selected hyperparameters to the classifier. Then calculate probabilities for a property to be a comps, construct Panda dataframe …
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 21 14:26:09 2018
@author: Vytautas.Bielinskas
Definitions:
JN - Jupyter Notebook
ML - Machine learning
BOG - Bag Of Words
RF - Random Forest
"""
# Get current directory
import os
cwd = os.getcwd()
print('The current directory is {}.'.format(cwd))
# ::: Use Pandas to load the data in a dataframe :::
import pandas as pd
training_set = pd.read_excel('training_set_20180625.xlsx', index_col = 0)
# ::: Clean dataset :::
# Data pre-process and data cleaning was performed in JP
print('The shape of training set after data cleaning is {}.'.format(training_set.shape))
# ::: Extract target from the features :::
y = training_set['Comps'].values
X = training_set[['Price per bedroom', 'Min. dist. to station', 'Evaluation']].values
# -----------------------------------------------------------------------------
# ::: ------------ RANDOM FOREST: PREPARATION FOR ACTION ----------------- :::
# ::: Feature scaling :::
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_RF = sc.fit_transform(X)
#X_test_RF = sc.transform(X_test)
# ::: Split dataset into train and test sets :::
""" This part is neccessary for checking accuracies """
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_RF, y, test_size = 0.2, random_state = 42)
print('Size of train dataset {} rows.'.format(X_train.shape[0]))
print('Size of test dataset {} rows.'.format(X_test.shape[0]))
N = 50
accur = []
record = {}
best_hyper = {}
this_acc = 0.0
# ::: Import Classifiers :::
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# ::: Array of Machine learning classifiers if experimenting with more than one :::
ML_classifiers = []
ML_classifiers.append(DecisionTreeClassifier())
ML_classifiers.append(RandomForestClassifier())
iteration = 0
while iteration < N:
# ::: Hyperparameter Tuning :::
from scipy.stats import randint
param_dist = {'max_depth' : [3, None],
'min_samples_leaf' : randint(1, 9),
'criterion' : ['gini', 'entropy']}
# ::: Instantiate a Decision Tree classifier : tree :::
tree = ML_classifiers[1] # <------ Choose your ML classifier here!
# ::: Instantiate the RandomizedSearchCV object: tree_cv :::
from sklearn.model_selection import RandomizedSearchCV
tree_cv = RandomizedSearchCV(tree, param_dist, cv = 5)
# ::: Fit it to the data :::
tree_cv.fit(X_train, y_train)
# ::: Print the tuned parameters and score :::
print(' ---------------')
print('Sample: {}.'.format(iteration))
print('Tuned Decision Tree Parameters: {}.'.format(tree_cv.best_params_))
print('Best score is {}.'.format(tree_cv.best_score_))
record["Best score"] = tree_cv.best_score_
accur.append(dict(record))
# ::: Predicting the Test set results :::
y_pred = tree_cv.predict(X_test)
# ::: Making Confusion Matrix :::
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
if tree_cv.best_score_ > this_acc:
print('!!! Current acc: {}, got new value is {}.'.format(this_acc, tree_cv.best_score_))
best_hyper['criterion'] = tree_cv.best_params_['criterion']
best_hyper['max_depth'] = tree_cv.best_params_['max_depth']
best_hyper['min_samples_leaf'] = tree_cv.best_params_['min_samples_leaf']
best_hyper['_best_score_'] = tree_cv.best_score_
this_acc = tree_cv.best_score_
# ::: Iteration +1 :::
iteration = iteration + 1
acc_df = pd.DataFrame(accur)
print('\nAverage accuracy is: {}.'.format(acc_df['Best score'].mean()))
print('Maximum accuracy is: {}.'.format(acc_df['Best score'].max()))
print('The difference is: {}.'.format(acc_df['Best score'].mean() - acc_df['Best score'].max()))
print('Selected features: {}'.format(best_hyper))
# ------------------------------->
# Plot accuracies
def plot_acc(acc_df):
N = len(acc_df)
import seaborn as sns
import numpy as np
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
# Generate plot
sns.set_style('darkgrid')
# Set plot size
_ = plt.rcParams['figure.figsize'] = (7,5)
_ = plt.hist(acc_df.values, color = '#1f4e79')
_ = plt.margins(0.02)
# Label the axes
_ = plt.xlabel('Test Accuracy', fontsize = 16, family='Arial')
_ = plt.ylabel('Samples', fontsize = 16)
# Display the plot
plt.show()
return None
""" !!! Open following line only for testing !!! """
plot_acc(acc_df)
# -----------------------------------------------------------------------------
# ---------------- SET THE BEST HYPERPARAMETERS TO THE CLASSIFIER -------------
param_dist = {'max_depth' : best_hyper['max_depth'],
'min_samples_leaf' : best_hyper['min_samples_leaf'],
'criterion' : best_hyper['criterion']}
if 'DecisionTree'.upper() in str(tree).split('(')[0].upper():
classifier = DecisionTreeClassifier(max_depth = param_dist['max_depth'],
min_samples_leaf = param_dist['min_samples_leaf'],
criterion = param_dist['criterion'])
print('\nDecision Tree classifier is using now...')
elif 'RandomForest'.upper() in str(tree).split('(')[0].upper():
classifier = RandomForestClassifier(max_depth = param_dist['max_depth'],
min_samples_leaf = param_dist['min_samples_leaf'],
criterion = param_dist['criterion'])
print('\nRandom Forest Classifier is using now...\n')
# ::: Import Test dataset :::
test_set = pd.read_csv(cwd + '/Tests/TestSet_20180626.csv')
# --------------------- RANDOM FOREST IMPLEMENTATION --------------------------
# -----------------------------------------------------------------------------
# ---------------------RE-ARANGE TRAINING AND TEST SETS -----------------------
# ::: Extract features from Test set :::
X_test_RF = test_set[['Price per bedroom', 'Min. dist. to station', 'Evaluation']] # <-- Pre-process for EVALUATION in JN!
# ::: Feature Scaling to Test set (Real data) :::
X_test_RF_scaled = sc.fit_transform(X_test_RF)
# ::: Add extra column for Comps identification :::
import numpy as np
z = np.zeros((len(X_test_RF), 1), dtype = 'float64')
""" Here zero (0) is default value for Comps. Later this field can be changed based on prediction on RF. """
# <-----X_test_RF_scaled = np.append(X_test_RF_scaled, z, axis = 1)----->
# ::: Fit classifier to the Train set. Let algorithm learn right now!
classifier.fit(X_train_RF, y)
# ::: Predicting the Test set results :::
y_pred = classifier.predict(X_test_RF_scaled)
# ::: Get values of Probabilities :::
probas = classifier.predict_proba(X_test_RF_scaled)
# ::: Get importances of features :::
importances = classifier.feature_importances_
# ::: Print the feature ranking :::
indices = np.argsort(importances)[::1]
print('Feature ranking:')
for f in range(X_test_RF.shape[1]):
print('%d. feature %d (%f)' % (f + 1, indices[f], importances[indices[f]]))
# ----------------------- SOME RESULTS VISUALIZATIONS -------------------------
# ::: ------------- Plot the feature importances of the forest -------------:::
import matplotlib.pyplot as plt
try:
plt.figure()
plt.title('Feature importances')
std = np.std([tree.feature_importances_ for tree in classifier.estimators_],
axis = 0)
plt.bar(range(X_test_RF.shape[1]), importances[indices],
color = '#1f4e79',
yerr = std[indices],
align = 'center')
plt.xticks(range(X_test_RF.shape[1]), indices)
plt.xlim([-1, X_test_RF.shape[1]])
plt.show()
except:
print('\n Feature importance graph can be plotted only for Random Forest Classifier. \n')
# ::: ------------------------ Plot the Scatter -------------------------:::
fig, ax = plt.subplots()
x = X_test_RF_scaled[:, 1]
y = X_test_RF_scaled[:, 2]
size_multiplier = 500
for point in range(0, len(X_test_RF_scaled), 1):
if y_pred[point] == 1.0:
ax.scatter(x[point], y[point],
c = '#5b9bd5',
s = probas[point][1] * size_multiplier,
label = 'Result',
alpha = 0.50,
edgecolors = 'none')
else:
ax.scatter(x[point],
y[point],
c = '#ff8080',
s = probas[point][0] * size_multiplier,
label = 'Result',
alpha = 0.50,
edgecolors = 'none')
plt.xlabel('Distance to the nearest transport hub', family = 'Arial')
plt.ylabel('Evaluation of description', family = 'Arial')
ax.grid(True)
plt.show()
# -------------------------- STRUCTURING DATA ---------------------------------
# ::: Create Pandas Dataframe for writing results in Excel :::
list_of_data = []
record_data = {}
index_of_ID = test_set.columns.get_loc('ID')
index_of_Address = test_set.columns.get_loc('Address')
index_of_Bedrooms = test_set.columns.get_loc('Bedrooms')
index_of_Bathrooms = test_set.columns.get_loc('Bathrooms')
index_of_Reception = test_set.columns.get_loc('Reception')
index_of_Size = test_set.columns.get_loc('Size')
index_of_flisted = test_set.columns.get_loc('First Listed')
index_of_OrigPrice = test_set.columns.get_loc('Original price')
index_of_CurrPrice = test_set.columns.get_loc('Current price')
index_of_URL = test_set.columns.get_loc('URL')
index_of_FloorLink = test_set.columns.get_loc('FLOORPLAN LINK')
index_of_New = test_set.columns.get_loc('New')
# Set list of possible areas
areas = []
areas.append('West Drayton')
areas.append('Ilford')
area_value = areas[1] # <----- Choose your area index here!
for row in range(0, len(X_test_RF_scaled), 1):
record_data['ID'] = test_set.iat[row, index_of_ID]
record_data['AREA'] = area_value
record_data['ADDRESS'] = test_set.iat[row, index_of_Address]
record_data['BEDROOMS'] = test_set.iat[row, index_of_Bedrooms]
record_data['BATHROOMS'] = test_set.iat[row, index_of_Bathrooms]
record_data['RECEPTION'] = test_set.iat[row, index_of_Reception]
record_data['SIZE'] = test_set.iat[row, index_of_Size]
record_data['FIRST LISTED'] = test_set.iat[row, index_of_flisted]
record_data['ORIGINAL PRICE'] = test_set.iat[row, index_of_OrigPrice]
record_data['CURRENT PRICE'] = test_set.iat[row, index_of_CurrPrice]
record_data['WEB'] = test_set.iat[row, index_of_URL]
record_data['FLOORPLAN LINK'] = test_set.iat[row, index_of_FloorLink]
record_data['NEW'] = test_set.iat[row, index_of_New]
record_data['0'] = probas[row][0]
record_data['1'] = probas[row][1]
list_of_data.append(dict(record_data))
DF = pd.DataFrame(list_of_data)
DF = DF[['ID', 'AREA', 'ADDRESS', 'BEDROOMS', 'BATHROOMS', 'RECEPTION', 'SIZE',
'FIRST LISTED', 'ORIGINAL PRICE', 'CURRENT PRICE', 'WEB',
'FLOORPLAN LINK', 'NEW', '1', '0']]
DF = DF.sort_values(by=['0'])
#------------------------------------------------------------------------------
#--------------------------- UPDATING EXCEL FILE ------------------------------
"""
All new data will be written to File_before.xlsx Excel file
"""
def updateExcel(dataset, area_value):
print('# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #')
print('Writing data to Excel file...')
import requests, re, os
import pandas as pd
from bs4 import BeautifulSoup
import openpyxl, os, datetime
from pandas import ExcelWriter as ewriter
from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font, Fill
from openpyxl.comments import Comment
""" checking if property is NOT LAND : start """
def ifNotLand(url):
land = False
r = requests.get(url)
c = r.content
soup = BeautifulSoup(c, "html.parser")
title = soup.find("title").text
if "land for sale".upper() in title.upper():
land = True
return land
""" checking if property is NOT LAND : end """
directory = (cwd + '/Excel')
os.chdir(directory)
os.getcwd()
filename = "File_before.xlsx"
wb = openpyxl.load_workbook(filename, data_only = False)
#print(wb.get_sheet_names)
print("---- Start update Excel file ----")
sheet = wb.get_sheet_by_name("Data Sales")
rowForSearching = 10
while len(str(sheet.cell(row = rowForSearching, column = 2).value)) > 4:
rowForSearching = rowForSearching + 1
end_of_table = rowForSearching - 1 # Get the end of current table
""" Define some Excel cell styles and formatting """
fillDefault = PatternFill(fill_type = None,
start_color = "FFFFFFFF",
end_color = "FF000000")
fillGreen = PatternFill(fill_type = "solid",
start_color = "56af11",
end_color = "56af11")
HyperlinkBlue = Font(color = "0563c1",
underline = "single")
rightAligment = Alignment(horizontal = "right")
centerAligment = Alignment(horizontal = "center")
""" Starting writing data to Excel file """
id_column = dataset.columns.get_loc("ID") # get index of ID column
id_area = dataset.columns.get_loc("AREA") # get index of STATION column
id_address = dataset.columns.get_loc("ADDRESS") # get index of ADDRESS column
id_bedrooms = dataset.columns.get_loc("BEDROOMS") # get index of BEDROOMS column
id_bathrooms = dataset.columns.get_loc("BATHROOMS") # get index of BATHROOMS column
id_reception = dataset.columns.get_loc("RECEPTION") # get index of RECEPTION column
id_size = dataset.columns.get_loc("SIZE") # get index of SIZE column
id_firstListed = dataset.columns.get_loc("FIRST LISTED") # get index of FIRST LISTED column
id_origPrice = dataset.columns.get_loc("ORIGINAL PRICE")# get index of ORIGINAL PRICE column
id_currPrice = dataset.columns.get_loc("CURRENT PRICE") # get index of CURRENT PRICE column
id_link = dataset.columns.get_loc("WEB") # get index of WEB column
id_floorplan = dataset.columns.get_loc("FLOORPLAN LINK")# get index of FLOORPLAN LINK column
id_new = dataset.columns.get_loc("NEW") # get index of NEW column
id_1 = dataset.columns.get_loc("1") # get index of <<1>> column
id_0 = dataset.columns.get_loc("0") # get index of <<0>> column
for x in range(rowForSearching, rowForSearching + len(dataset), 1):
for j in range(1, sheet.max_column, 1):
sheet.cell(row = x, column = j).fill = fillDefault
dataset_index = 0
property_index = rowForSearching
for i in range(rowForSearching, rowForSearching + len(dataset), 1):
link_to_property = dataset.iat[dataset_index, id_link]
land = ifNotLand(link_to_property)
duplicates = 0
if land == False and "POA" not in str(dataset.iat[dataset_index, id_currPrice]).upper():
""" writing MONTH column """
d = datetime.datetime.strptime(str(datetime.date.today()), '%Y-%m-%d')
d= d.strftime('%d-%b-%Y').replace("-20", "-")
sheet.cell(row = property_index, column = 2).value = d
sheet.cell(row = property_index, column = 2).alignment = rightAligment
""" writing SIZE columns """
size = dataset.iat[dataset_index, id_size]
if str(size).replace(" ", "") == "0":
sheet.cell(row = property_index, column = 9).value = ""
else:
sheet.cell(row = property_index, column = 9).value = dataset.iat[dataset_index, id_size]
""" writing ORIGINAL PRICE columns """
sheet.cell(row = property_index, column = 11).value = dataset.iat[dataset_index, id_origPrice]
sheet.cell(row = property_index, column = 11).alignment = rightAligment
""" writing ID column """
sheet.cell(row = property_index, column = 3).value = dataset.iat[dataset_index, id_column]
sheet.cell(row = property_index, column = 3).alignment = rightAligment
""" checking past data by ID : start --> """
for past_row in range(10, property_index-1, 1):
past_id = str(sheet.cell(row = past_row, column = 3).value)
if str(sheet.cell(row = property_index, column = 3).value) == past_id:
duplicates = duplicates + 1
sheet.cell(row = property_index, column = 3).fill = fillGreen
formula_for_comps = '=IF(AND(F' + str(property_index) + '<>2, F' + str(property_index) + '<>4), "",IFERROR(VLOOKUP(C' + str(property_index) + ',$C$3:$V$' + str(end_of_table) + ',20,FALSE), "new"))'
size = sheet.cell(row = past_row, column = 9).value
sheet.cell(row = property_index, column = 9).value = size
sheet.cell(row = property_index, column = 9).fill = fillGreen
print("Duplicate on row:", past_row, ", new property is on", property_index, "row")
""" checking past data by ID : end --> """
""" writing AREA column """
sheet.cell(row = property_index, column = 4).value = dataset.iat[dataset_index, id_area]
""" writing ADDRESS column """
sheet.cell(row = property_index, column = 5).value = dataset.iat[dataset_index, id_address]
""" writing BEDROOMS column """
sheet.cell(row = property_index, column = 6).value = dataset.iat[dataset_index, id_bedrooms]
""" writing BATHROOMS columns """
sheet.cell(row = property_index, column = 7).value = dataset.iat[dataset_index, id_bathrooms]
""" writing RECEPTION columns """
sheet.cell(row = property_index, column = 8).value = dataset.iat[dataset_index, id_reception]
""" writing FIRST LISTED columns """
sheet.cell(row = property_index, column = 10).value = dataset.iat[dataset_index, id_firstListed]
sheet.cell(row = property_index, column = 10).alignment = rightAligment
""" writing FLOOR PLAN LINK columns """
FullLink = ""
if ".com" in dataset.iat[dataset_index, id_floorplan]:
FullLink = '=HYPERLINK("' + dataset.iat[dataset_index, id_floorplan] + '","' + "Floor" + '")'
sheet.cell(row = property_index, column = 15).font = HyperlinkBlue
else:
FullLink = ""
sheet.cell(row = property_index, column = 15).value = FullLink
sheet.cell(row = property_index, column = 15).alignment = centerAligment
""" writing CURRENT PRICE to columns """
sheet.cell(row = property_index, column = 12).value = dataset.iat[dataset_index, id_currPrice]
sheet.cell(row = property_index, column = 12).alignment = rightAligment
""" writing LINK to column """
FullLink = '=HYPERLINK("' + dataset.iat[dataset_index, id_link] + '","' + "Link" + '")'
sheet.cell(row = property_index, column = 16).value = FullLink
sheet.cell(row = property_index, column = 16).font = HyperlinkBlue
""" writing LINK URL to column """
sheet.cell(row = property_index, column = 18).value = dataset.iat[dataset_index, id_link]
""" writing NEW columns """
sheet.cell(row = property_index, column = 21).value = dataset.iat[dataset_index, id_new]
""" writing Prob <<1>> column """
sheet.cell(row = property_index, column = 28).value = dataset.iat[dataset_index, id_1]
sheet.cell(row = property_index, column = 28).fill = fillDefault
""" writing Prob <<0>> column """
sheet.cell(row = property_index, column = 29).value = dataset.iat[dataset_index, id_0]
sheet.cell(row = property_index, column = 29).fill = fillDefault
dataset_index = dataset_index + 1 #Go to next row
property_index = property_index + 1
""" Write formulas : start """
formula_for_comps = '=IF(AND(F' + str(property_index) + '<>2, F' + str(property_index) + '<>4), "",IFERROR(VLOOKUP(C' + str(property_index) + ',$C$3:$V$' + str(end_of_table) + ',20,FALSE), "new"))'
sheet.cell(row = property_index, column = 27).value = formula_for_comps
sheet.cell(row = property_index, column = 27).fill = fillGreen
formula_check_size = '=IF(AND(Y' + str(property_index) + '="new", I' + str(property_index) + '=""), "check", IF(I' + str(property_index) + '="", Y' + str(property_index) + ',I' + str(property_index) + '))'
sheet.cell(row = property_index, column = 26).value = formula_check_size
sheet.cell(row = property_index, column = 26).fill = fillGreen
""" Write formulas : end """
""" Calculation: DISCOUNT column """
sheet.cell(row = property_index, column = 13).value = '=IFERROR((L' + str(property_index) + '-K' + str(property_index) + ')/K' + str(property_index)+ ',"")'
sheet.cell(row = property_index, column = 13).number_format = "0%"
""" Calculation: PSF column """
sheet.cell(row = property_index, column = 14).value = '=IFERROR(L' + str(property_index) + '/I' + str(property_index) + ',"")'
""" Calculation: Type column """
sheet.cell(row = property_index, column = 24).value = '=IF(F' + str(property_index) + '=0,"Studio",IF(F' + str(property_index) + '>=5,"5+",F' + str(property_index) + '))'
else:
print("Row", i, "- this is LAND FOR SALE")
dataset_index = dataset_index + 1
""" SAVE DATA TO EXCEL """
import datetime
from datetime import date
timeNow = str(datetime.datetime.now()).replace(":","")[:-10].replace(' ', '')
wb.save("File_After_SALES - " + area_value + "_" + timeNow + ".xlsx")
print('Finished.')
return None
updateExcel(DF, area_value)
# ------------------------- Update EXCEL file : end ---------------------------
#------------------------------------------------------------------------------
"""
!!! Part below is for EXPERIMENTING !!!
"""
# ::: -------------------------- XGBOOST ---------------------------------- ###
# ::: Ready to train! :::
import xgboost as xgb
# ::: Set-up XGBoost classifier :::
classifier = xgb.sklearn.XGBClassifier(nthread = -1, seed = 42)
# ::: Train the model with parameters :::
classifier.fit(X_train, y_train)
# ::: Evaluation :::
predictions = classifier.predict(X_test)
#------------------------------------------------------------------------------
# ::: Handy XGBoost methods
%matplotlib inline
import matplotlib.pyplot as plt
# Plot Feature Importances >>>
plt.figure(figsize=(8,5))
xgb.plot_importance(classifier, ax = plt.gca())
# Plot Tress that were built by XGBoost
plt.figure(figsize=(8,5))
xgb.plot_tree(classifier, ax = plt.gca())
# Access the characteristics of the model
print('Number of boosting trees: {}.'.format(classifier.n_estimators))
print('Max depth of trees: {}.'.format(classifier.max_depth))
print('Objective function: {}.'.format(classifier.objective))
# -----------------------------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment