Skip to content

Instantly share code, notes, and snippets.

@barangerbenjamin
Created August 24, 2021 10:16
Show Gist options
  • Save barangerbenjamin/b45ff24f22105692cea971850c564a11 to your computer and use it in GitHub Desktop.
Save barangerbenjamin/b45ff24f22105692cea971850c564a11 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "first_ml_model.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "Z6zFo4c3B5aE"
},
"source": [
"# Import pandas to manipulate dataset"
]
},
{
"cell_type": "code",
"metadata": {
"id": "jfgTPhdu7TwH"
},
"source": [
"import pandas as pd"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "WTEJ8SggB9_7"
},
"source": [
"# Load the sample dataset that ships with Google Colab"
]
},
{
"cell_type": "code",
"metadata": {
"id": "1Xldpj7T7VFq"
},
"source": [
"df = pd.read_csv('sample_data/california_housing_train.csv')"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "pC83vaflCFtU"
},
"source": [
"# View sample of data"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"id": "uQHZqoWP8q1a",
"outputId": "6c34a8d1-3e1b-4cee-8e67-7a34be78ff88"
},
"source": [
"df.head()"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>housing_median_age</th>\n",
" <th>total_rooms</th>\n",
" <th>total_bedrooms</th>\n",
" <th>population</th>\n",
" <th>households</th>\n",
" <th>median_income</th>\n",
" <th>median_house_value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-114.31</td>\n",
" <td>34.19</td>\n",
" <td>15.0</td>\n",
" <td>5612.0</td>\n",
" <td>1283.0</td>\n",
" <td>1015.0</td>\n",
" <td>472.0</td>\n",
" <td>1.4936</td>\n",
" <td>66900.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-114.47</td>\n",
" <td>34.40</td>\n",
" <td>19.0</td>\n",
" <td>7650.0</td>\n",
" <td>1901.0</td>\n",
" <td>1129.0</td>\n",
" <td>463.0</td>\n",
" <td>1.8200</td>\n",
" <td>80100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-114.56</td>\n",
" <td>33.69</td>\n",
" <td>17.0</td>\n",
" <td>720.0</td>\n",
" <td>174.0</td>\n",
" <td>333.0</td>\n",
" <td>117.0</td>\n",
" <td>1.6509</td>\n",
" <td>85700.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-114.57</td>\n",
" <td>33.64</td>\n",
" <td>14.0</td>\n",
" <td>1501.0</td>\n",
" <td>337.0</td>\n",
" <td>515.0</td>\n",
" <td>226.0</td>\n",
" <td>3.1917</td>\n",
" <td>73400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-114.57</td>\n",
" <td>33.57</td>\n",
" <td>20.0</td>\n",
" <td>1454.0</td>\n",
" <td>326.0</td>\n",
" <td>624.0</td>\n",
" <td>262.0</td>\n",
" <td>1.9250</td>\n",
" <td>65500.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" longitude latitude ... median_income median_house_value\n",
"0 -114.31 34.19 ... 1.4936 66900.0\n",
"1 -114.47 34.40 ... 1.8200 80100.0\n",
"2 -114.56 33.69 ... 1.6509 85700.0\n",
"3 -114.57 33.64 ... 3.1917 73400.0\n",
"4 -114.57 33.57 ... 1.9250 65500.0\n",
"\n",
"[5 rows x 9 columns]"
]
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "b2-yiqQDCKP6"
},
"source": [
"# Inspect data quality"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CPtCbQUw8Kk0",
"outputId": "236c2c11-8436-4dcf-c348-b6d118e656d3"
},
"source": [
"df.info()"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 17000 entries, 0 to 16999\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 longitude 17000 non-null float64\n",
" 1 latitude 17000 non-null float64\n",
" 2 housing_median_age 17000 non-null float64\n",
" 3 total_rooms 17000 non-null float64\n",
" 4 total_bedrooms 17000 non-null float64\n",
" 5 population 17000 non-null float64\n",
" 6 households 17000 non-null float64\n",
" 7 median_income 17000 non-null float64\n",
" 8 median_house_value 17000 non-null float64\n",
"dtypes: float64(9)\n",
"memory usage: 1.2 MB\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 317
},
"id": "p4GPylAv8lAz",
"outputId": "2bb8e909-eac6-4732-edaa-b79751845ae8"
},
"source": [
"df.describe()"
],
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>housing_median_age</th>\n",
" <th>total_rooms</th>\n",
" <th>total_bedrooms</th>\n",
" <th>population</th>\n",
" <th>households</th>\n",
" <th>median_income</th>\n",
" <th>median_house_value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>17000.000000</td>\n",
" <td>17000.000000</td>\n",
" <td>17000.000000</td>\n",
" <td>17000.000000</td>\n",
" <td>17000.000000</td>\n",
" <td>17000.000000</td>\n",
" <td>17000.000000</td>\n",
" <td>17000.000000</td>\n",
" <td>17000.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>-119.562108</td>\n",
" <td>35.625225</td>\n",
" <td>28.589353</td>\n",
" <td>2643.664412</td>\n",
" <td>539.410824</td>\n",
" <td>1429.573941</td>\n",
" <td>501.221941</td>\n",
" <td>3.883578</td>\n",
" <td>207300.912353</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>2.005166</td>\n",
" <td>2.137340</td>\n",
" <td>12.586937</td>\n",
" <td>2179.947071</td>\n",
" <td>421.499452</td>\n",
" <td>1147.852959</td>\n",
" <td>384.520841</td>\n",
" <td>1.908157</td>\n",
" <td>115983.764387</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-124.350000</td>\n",
" <td>32.540000</td>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.499900</td>\n",
" <td>14999.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>-121.790000</td>\n",
" <td>33.930000</td>\n",
" <td>18.000000</td>\n",
" <td>1462.000000</td>\n",
" <td>297.000000</td>\n",
" <td>790.000000</td>\n",
" <td>282.000000</td>\n",
" <td>2.566375</td>\n",
" <td>119400.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>-118.490000</td>\n",
" <td>34.250000</td>\n",
" <td>29.000000</td>\n",
" <td>2127.000000</td>\n",
" <td>434.000000</td>\n",
" <td>1167.000000</td>\n",
" <td>409.000000</td>\n",
" <td>3.544600</td>\n",
" <td>180400.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>-118.000000</td>\n",
" <td>37.720000</td>\n",
" <td>37.000000</td>\n",
" <td>3151.250000</td>\n",
" <td>648.250000</td>\n",
" <td>1721.000000</td>\n",
" <td>605.250000</td>\n",
" <td>4.767000</td>\n",
" <td>265000.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>-114.310000</td>\n",
" <td>41.950000</td>\n",
" <td>52.000000</td>\n",
" <td>37937.000000</td>\n",
" <td>6445.000000</td>\n",
" <td>35682.000000</td>\n",
" <td>6082.000000</td>\n",
" <td>15.000100</td>\n",
" <td>500001.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" longitude latitude ... median_income median_house_value\n",
"count 17000.000000 17000.000000 ... 17000.000000 17000.000000\n",
"mean -119.562108 35.625225 ... 3.883578 207300.912353\n",
"std 2.005166 2.137340 ... 1.908157 115983.764387\n",
"min -124.350000 32.540000 ... 0.499900 14999.000000\n",
"25% -121.790000 33.930000 ... 2.566375 119400.000000\n",
"50% -118.490000 34.250000 ... 3.544600 180400.000000\n",
"75% -118.000000 37.720000 ... 4.767000 265000.000000\n",
"max -114.310000 41.950000 ... 15.000100 500001.000000\n",
"\n",
"[8 rows x 9 columns]"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YivyO71A-SA2"
},
"source": [
"# Is there an obvious correlation between the target and the features?"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 382
},
"id": "FwdV0-h48ota",
"outputId": "94e85dea-a2a1-4429-84d4-2fa85c8dc466"
},
"source": [
"import seaborn as sns\n",
"\n",
"sns.heatmap(df.corr())"
],
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7efc4ab47a90>"
]
},
"metadata": {},
"execution_count": 8
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fbMbqX5BCWr1"
},
"source": [
"# Example of a potential linear relationship between two features"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 297
},
"id": "ayuyA5Zq8-Ej",
"outputId": "e4929002-f12d-410f-e06b-fde641d303ce"
},
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# total_rooms is plotted on the x-axis and population on the y-axis,\n",
"# so each axis label names the column that is actually drawn on it.\n",
"plt.scatter(df['total_rooms'], df['population'])\n",
"plt.ylabel('population')\n",
"plt.xlabel('total_rooms')"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Text(0.5, 0, 'total_rooms')"
]
},
"metadata": {},
"execution_count": 9
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YZ4YWgF1CuqZ"
},
"source": [
"# Separate our features and target"
]
},
{
"cell_type": "code",
"metadata": {
"id": "GZ3gO8PY9rRu"
},
"source": [
"# Features: every column except the one the model has to predict\n",
"X = df.drop(columns=['median_house_value'])\n",
"# Target: the column the model learns to predict\n",
"y = df.loc[:, 'median_house_value']"
],
"execution_count": 10,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "nYY-_yuSC3qb"
},
"source": [
"# Split dataset, 70% for training model, 30% to evaluate it"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4Y_JPQBS-ufw"
},
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 30% of the rows for evaluation. random_state pins the shuffle so the\n",
"# same rows land in train/test on every run and the results are reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)"
],
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "sGxiCMOK-0PX",
"outputId": "ff487afd-3bec-4c05-9a9f-008f455b881b"
},
"source": [
"X_train.shape, y_train.shape"
],
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"((11900, 8), (11900,))"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SwDM4tj5DDCk"
},
"source": [
"# Load LinearRegression class"
]
},
{
"cell_type": "code",
"metadata": {
"id": "jdSVU8T9_Asn"
},
"source": [
"from sklearn.linear_model import LinearRegression"
],
"execution_count": 13,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "59goDRfdDMRE"
},
"source": [
"## Instantiate it"
]
},
{
"cell_type": "code",
"metadata": {
"id": "gZ84dOPA_JtI"
},
"source": [
"lin_reg = LinearRegression()"
],
"execution_count": 14,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "VOoYxrZeDQ-_"
},
"source": [
"## Train our model with our train data"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BT1VsDyW_MXt",
"outputId": "42556605-8f26-4d29-b1c8-a8bbcede2223"
},
"source": [
"lin_reg.fit(X_train, y_train)"
],
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nA8yxLTmDVZP"
},
"source": [
"## Evaluate model score"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9RY3Yw8a_R-Q",
"outputId": "4f34677f-9431-4622-c439-36380a46b425"
},
"source": [
"lin_reg.score(X_test, y_test)"
],
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.6226125236333266"
]
},
"metadata": {},
"execution_count": 16
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "k4hGg1_pDZkW"
},
"source": [
"## How to make a prediction"
]
},
{
"cell_type": "code",
"metadata": {
"id": "EwWHe8R__Zwd"
},
"source": [
"new_block = [[-119.770000,\t36.740000,\t50.000000,\t1325.000000,\t280.000000,\t811.000000,\t281.000000,\t1.866700]]"
],
"execution_count": 17,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "5kfW94B_AUne"
},
"source": [
"# Wrap the raw values in a one-row DataFrame that reuses the feature column names of X\n",
"new_block_df = pd.DataFrame(new_block, columns=X.columns.tolist())"
],
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 80
},
"id": "gcPoyMhPBNEk",
"outputId": "b6897155-43e3-436d-bd0c-8392455ae47d"
},
"source": [
"new_block_df"
],
"execution_count": 19,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>housing_median_age</th>\n",
" <th>total_rooms</th>\n",
" <th>total_bedrooms</th>\n",
" <th>population</th>\n",
" <th>households</th>\n",
" <th>median_income</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-119.77</td>\n",
" <td>36.74</td>\n",
" <td>50.0</td>\n",
" <td>1325.0</td>\n",
" <td>280.0</td>\n",
" <td>811.0</td>\n",
" <td>281.0</td>\n",
" <td>1.8667</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" longitude latitude ... households median_income\n",
"0 -119.77 36.74 ... 281.0 1.8667\n",
"\n",
"[1 rows x 8 columns]"
]
},
"metadata": {},
"execution_count": 19
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aoomcdzb_1Wa",
"outputId": "bc2fc8c6-219c-487c-8619-13f5f5e99985"
},
"source": [
"# Predict from the DataFrame rather than the raw list: the model was fitted on a\n",
"# DataFrame, so the input should carry the same column names in the same order.\n",
"lin_reg.predict(new_block_df)"
],
"execution_count": 20,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([106314.4523928])"
]
},
"metadata": {},
"execution_count": 20
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment