Skip to content

Instantly share code, notes, and snippets.

@ShahStavan
Last active September 29, 2023 19:23
Show Gist options
  • Save ShahStavan/4f841c7f1a634690c2007c4da3ecab53 to your computer and use it in GitHub Desktop.
Save ShahStavan/4f841c7f1a634690c2007c4da3ecab53 to your computer and use it in GitHub Desktop.
Linear Regression with Regularization (without using sklearn or equivalent library) and Simple and Multiple Linear Regression with and without regularization using Sklearn
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyPA840Ri0F010krgnO0YHXL",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ShahStavan/4f841c7f1a634690c2007c4da3ecab53/22bce539_practical_4.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qi4S_bZFIXsb"
},
"outputs": [],
"source": [
"from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import datasets\n",
"from sklearn.preprocessing import StandardScaler\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.linear_model import SGDRegressor\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"source": [
"# Define the URL of the dataset\n",
"data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n",
"\n",
"# Read the dataset with proper column names and separator\n",
"column_names = [\n",
" \"CRIM\", \"ZN\", \"INDUS\", \"CHAS\", \"NOX\", \"RM\", \"AGE\", \"DIS\", \"RAD\", \"TAX\", \"PTRATIO\", \"B\", \"LSTAT\", \"MEDV\"\n",
"]\n",
"raw_df = pd.read_csv(data_url, sep=\"\\s+\", skiprows=22, header=None, names=column_names)\n"
],
"metadata": {
"id": "iuW56j49IhWs"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"housing = datasets.fetch_california_housing()\n",
"data=housing.data\n",
"data.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hEwpvMyEImyn",
"outputId": "bbc79adb-00d0-44af-9ccb-20b4546a459f"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(20640, 8)"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"source": [
"# Split data into train and test sets\n",
"x_train1=data[:15000,0:7]\n",
"x_test1=data[15000:,0:7]\n",
"y_train=data[:15000,7]\n",
"y_test=data[15000:,7]\n",
"\n",
"# Add bias term\n",
"x_train = np.ones((x_train1.shape[0], x_train1.shape[1] + 1))\n",
"x_train[:, 1:] = x_train1\n",
"\n",
"x_test = np.ones((x_test1.shape[0], x_test1.shape[1] + 1))\n",
"x_test[:, 1:] = x_test1\n",
"\n",
"# Standardize features\n",
"scaler = StandardScaler()\n",
"scaler.fit(x_train[:, 1:])\n",
"x_train[:, 1:] = scaler.transform(x_train[:, 1:])\n",
"x_test[:, 1:] = scaler.transform(x_test[:, 1:])\n",
"\n",
"# Initialize theta with random values\n",
"theta = np.random.uniform(0, 1, size=x_train.shape[1])\n"
],
"metadata": {
"id": "IoPd5b__I3fi"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Hyperparameters\n",
"n_iterations = 1000\n",
"learning_rate = 0.01\n",
"\n",
"# Number of training examples (m) and features (n)\n",
"m=x_train.shape[0]\n",
"n=x_train.shape[1]\n",
"\n",
"# Initialize a list to store the cost history\n",
"cost_history = []\n",
"\n",
"# Gradient Descent\n",
"for iteration in range(n_iterations):\n",
" # Initialize gradient vector\n",
" gradient = np.zeros(x_train.shape[1])\n",
"\n",
" # Calculate predictions\n",
" predictions = np.dot(x_train, theta)\n",
"\n",
" # Calculate error\n",
" errors = predictions - y_train\n",
" s=0\n",
"\n",
" # Calculate cost (mean squared error)\n",
" for j in range(errors.size):\n",
" s=s+errors[j]*errors[j]\n",
" cost_history.append((1/2*m) * s)\n",
"\n",
" # Calculate gradients\n",
" for j in range(n):\n",
" gradient[j]=np.sum(errors*(x_train.T)[j])\n",
"\n",
"\n",
" # Update theta using gradient descent\n",
" theta -= (learning_rate / m) * gradient\n"
],
"metadata": {
"id": "fycoLZkPJVac"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"iteration=np.arange(1,n_iterations+1)\n",
"plt.plot(iteration,cost_history)\n",
"plt.title(\"Cost function curve\")\n",
"plt.xlabel(\"Iterations\")\n",
"plt.ylabel(\"Cost\")\n",
"plt.show()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 472
},
"id": "u5gWWRSGKiX7",
"outputId": "5497be60-0935-4619-c625-24a444371f66"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
],
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"print(\"Theta : \",theta,\"\\n\\n \",theta.shape)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "83F6Tn46N1G9",
"outputId": "50becdd8-9ee0-4ee0-e777-cde04981db46"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Theta : [-1.19041950e+02 -2.98013845e-01 -2.18830584e-01 4.51273747e-01\n",
" -2.67761206e-01 -5.13885088e-02 3.41610593e-02 -1.79038326e+00] \n",
"\n",
" (8,)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"predictions=np.dot(x_test,theta)\n",
"predictions"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mdB5eNedN5n1",
"outputId": "b7156a07-ed34-4dd3-a294-327647135505"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([-117.01075092, -116.58555421, -116.39039419, ..., -122.23571743,\n",
" -122.27462559, -122.30227714])"
]
},
"metadata": {},
"execution_count": 37
}
]
},
{
"cell_type": "code",
"source": [
"MAE=mean_absolute_error(y_true=y_test,y_pred=predictions)\n",
"MSE=mean_squared_error(y_true=y_test,y_pred=predictions)\n",
"print(\"MAE = \",MAE,\"\\nMSE = \",MSE)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pCFBoVANN5m1",
"outputId": "5018909e-6574-4956-b51f-40273bf59260"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"MAE = 0.7512238841177775 \n",
"MSE = 0.8094834008389342\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### 2. Multiple Linear Regression using Normal Equation"
],
"metadata": {
"id": "8Thex4jaODVW"
}
},
{
"cell_type": "code",
"source": [
"\n",
"# Calculate the coefficients (theta) using the Normal Equation\n",
"x_train_transpose = np.transpose(x_train)\n",
"xx = np.dot(x_train_transpose, x_train)\n",
"xx_inv = np.linalg.inv(xx)\n",
"theta = np.dot(np.dot(xx_inv, x_train_transpose), y_train)\n",
"\n",
"# Make predictions using the calculated coefficients\n",
"predictions = np.dot(x_test, theta)\n",
"\n",
"# Calculate Mean Absolute Error (MAE) and Mean Squared Error (MSE)\n",
"MAE = mean_absolute_error(y_true=y_test, y_pred=predictions)\n",
"MSE = mean_squared_error(y_true=y_test, y_pred=predictions)\n",
"\n",
"# Print the results\n",
"print(\"Theta:\\n\", theta)\n",
"print(\"\\nMAE:\", MAE)\n",
"print(\"MSE:\", MSE)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fD6VihfsN-mx",
"outputId": "cc0448d8-2bd5-4bbc-f7cd-a8ce4bf5ab7c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Theta:\n",
" [-1.19041950e+02 -3.12839097e-01 -2.17991470e-01 4.90445461e-01\n",
" -3.04269042e-01 -5.08202119e-02 3.40663449e-02 -1.79408497e+00]\n",
"\n",
"MAE: 0.7485046322773274\n",
"MSE: 0.8055934931615918\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### Simple Linear Regression using Gradient descent with inbuilt library"
],
"metadata": {
"id": "Ou7wrZgdOrCC"
}
},
{
"cell_type": "code",
"source": [
"\n",
"# Load data from the given URL\n",
"data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n",
"raw_df = pd.read_csv(data_url, sep=\"\\s+\", skiprows=22, header=None)\n",
"\n",
"# Remove rows with missing values\n",
"df = raw_df.dropna()\n",
"\n",
"# Extract features (x) and target (y)\n",
"data = np.array(df)\n",
"x = data[:, 0:10]\n",
"y = data[:, 10]\n",
"\n",
"# Standardize the features\n",
"scalar = StandardScaler()\n",
"x_scaled = scalar.fit_transform(x)\n",
"\n",
"# Split the data into training and testing sets\n",
"x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.8, shuffle=True)\n",
"\n",
"# Create and train the SGDRegressor model\n",
"model = SGDRegressor(max_iter=1000, alpha=0.01)\n",
"model.fit(x_train, y_train)\n",
"\n",
"# Get the model parameters (coefficients)\n",
"parameters = model.coef_\n",
"print(\"Model Coefficients (Theta_1 to Theta_10):\", parameters)\n",
"\n",
"# Get the intercept (Theta_0)\n",
"theta0 = model.intercept_\n",
"print(\"Intercept (Theta_0):\", theta0)\n",
"\n",
"# Make predictions on the test data\n",
"predictions = model.predict(x_test)\n",
"\n",
"# Calculate and print the mean absolute error\n",
"absolute_error = mean_absolute_error(y_test, predictions)\n",
"print(\"Mean Absolute Error (MAE):\", absolute_error)\n",
"\n",
"# Calculate and print the mean squared error\n",
"mean_error = mean_squared_error(y_test, predictions)\n",
"print(\"Mean Squared Error (MSE):\", mean_error)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "G7WTKWloOp68",
"outputId": "13ff8238-949b-4ebd-b210-298318da758f"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Model Coefficients (Theta_1 to Theta_10): [ 0.03183344 -0.90938062 0.51084593 -0.15521076 -1.14944335 -0.37306988\n",
" 0.36059199 0.26946265 0.59068271 0.43558996]\n",
"Intercept (Theta_0): [18.46626708]\n",
"Mean Absolute Error (MAE): 1.2805017931292624\n",
"Mean Squared Error (MSE): 2.7411612080195584\n"
]
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment