# THIS SCRIPT USES THE LIBRARY AT:
# https://github.com/hzeller/rpi-rgb-led-matrix
# BE SURE TO CLONE IT AND READ THE README, as highlighted in the video :)
import os, time, threading, random
import feedparser
from PIL import Image, ImageFont, ImageDraw
from random import shuffle
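# Hedged sketch (not part of the original gist): a minimal render using the
# Python bindings of the rpi-rgb-led-matrix library linked above. The panel
# size and wiring below are assumptions; adjust rows/cols/hardware_mapping
# to match your hardware.
from rgbmatrix import RGBMatrix, RGBMatrixOptions

options = RGBMatrixOptions()
options.rows = 32                      # panel height in pixels (assumed)
options.cols = 64                      # panel width in pixels (assumed)
options.hardware_mapping = 'regular'   # e.g. 'adafruit-hat' for the Adafruit bonnet

matrix = RGBMatrix(options=options)
frame = Image.new('RGB', (options.cols, options.rows))
ImageDraw.Draw(frame).text((1, 1), 'Hello', fill=(255, 0, 0))
matrix.SetImage(frame)                 # push the PIL image to the panel
time.sleep(5)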
Supervised Machine Learning: Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Import the data using the file path
data = pd.read_csv('Ames_Housing_Sales.csv')
df = data.copy()
# Get a pd.Series consisting of all the string categoricals
one_hot_encode_cols = data.dtypes[data.dtypes == object] # filtering by string categoricals
one_hot_encode_cols = one_hot_encode_cols.index.tolist() # list of categorical fields
# Encode these columns as categoricals so one hot encoding works on split data (if desired)
for col in one_hot_encode_cols:
    data[col] = pd.Categorical(data[col])
# Do the one hot encoding
data = pd.get_dummies(data, columns=one_hot_encode_cols, drop_first=True)
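# Hedged sanity check (not in the original gist): compare column counts
# before and after get_dummies to see how many dummy columns were created.
print('columns before encoding: {}, after: {}'.format(df.shape[1], data.shape[1]))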
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
lr = LinearRegression()
y_s_col = "SalePrice"
X_s = data.drop(y_s_col, axis=1)
y_s = data[y_s_col]
# Create a list of float columns to check for skewing
mask = df.dtypes == float
float_cols = df.columns[mask]
skew_limit = 0.75 # define a limit above which we will log transform
skew_vals = df[float_cols].skew()
# Showing the skewed columns
# The chain below is truncated in the gist; the tail (to_frame/rename/query)
# is reconstructed so the filter on skew_limit actually runs.
skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame()
             .rename(columns={0: 'Skew'})
             .query('abs(Skew) > {}'.format(skew_limit)))
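# Hedged continuation (assumption: the skewed fields are log-transformed
# next, as the skew_limit comment implies). np.log1p keeps zero-valued
# entries defined.
for col in skew_cols.index.values:
    if col == 'SalePrice':          # keep the target on its original scale here
        continue
    df[col] = df[col].apply(np.log1p)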
from sklearn.model_selection import train_test_split
y_col = 'SalePrice'
# Split the data that is one-hot encoded
data_ohc = data  # alias: `data` already holds the one-hot encoded frame (name added so the code runs)
feature_cols = [x for x in data_ohc.columns if x != y_col]
X_data_ohc = data_ohc[feature_cols]
y_data_ohc = data_ohc[y_col]
X_train_ohc, X_test_ohc, y_train_ohc, y_test_ohc = train_test_split(X_data_ohc, y_data_ohc,
                                                                    test_size=0.3, random_state=42)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
LR = LinearRegression()
# Data that have been one-hot encoded
LR = LR.fit(X_train_ohc, y_train_ohc)
y_train_ohc_pred = LR.predict(X_train_ohc)
y_test_ohc_pred = LR.predict(X_test_ohc)
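# Hedged follow-up (not in the original snippet): put the mean_squared_error
# import above to use, comparing train vs. test error to gauge how well the
# one-hot encoded model generalizes.
train_mse = mean_squared_error(y_train_ohc, y_train_ohc_pred)
test_mse = mean_squared_error(y_test_ohc, y_test_ohc_pred)
print('train MSE: {:,.0f}   test MSE: {:,.0f}'.format(train_mse, test_mse))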
plt.figure(figsize=(12,5))
sns.set_context('talk')
sns.set_style('ticks')
sns.set_palette('dark')
ax = plt.axes()
# we are going to use y_test_ohc, y_test_ohc_pred
ax.scatter(y_test_ohc, y_test_ohc_pred, alpha=.5)
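# Hedged finishing touches (the gist cuts off here): axis labels and a
# y = x reference line make over/under-prediction visible at a glance.
lims = [y_test_ohc.min(), y_test_ohc.max()]
ax.plot(lims, lims, 'k--', alpha=0.5)   # identity line: perfect predictions
ax.set(xlabel='Actual SalePrice', ylabel='Predicted SalePrice',
       title='Linear regression on one-hot encoded data')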
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
kf = KFold(shuffle=True, random_state=72018, n_splits=3)
X = data.drop('SalePrice', axis=1)
y = data.SalePrice
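# Hedged sketch putting the imports above to work (the gist stops before
# using them): scale the features, fit ordinary least squares, and score
# out-of-fold predictions with the KFold splitter defined above.
estimator = Pipeline([('scaler', StandardScaler()),
                      ('regression', LinearRegression())])
predictions = cross_val_predict(estimator, X, y, cv=kf)
print('cross-validated R^2: {:.3f}'.format(r2_score(y, predictions)))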
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
# root-mean-squared error function
def rmse(ytrue, ypredicted):
    return np.sqrt(mean_squared_error(ytrue, ypredicted))
alphas = [0.005, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 80]
ridgeCV = RidgeCV(alphas=alphas,
                  cv=4).fit(X_train_ohc, y_train_ohc)  # cv value and training arrays assumed; the gist is truncated here
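# Hedged evaluation (assumed continuation): report the alpha RidgeCV
# selected and the held-out RMSE via the rmse helper defined above.
print('chosen alpha: {}'.format(ridgeCV.alpha_))
print('RidgeCV test RMSE: {:,.0f}'.format(rmse(y_test_ohc, ridgeCV.predict(X_test_ohc))))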