Last active
February 27, 2019 16:47
Python plotting of "Vinho Verde" red wine dataset for linear regression
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn import linear_model | |
# URL for the Wine Quality Portuguese "Vinho Verde" red wine dataset (UCI Machine Learning Repository)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# Pick the URL opener for the running interpreter. Only the import is guarded
# by the try, so the ImportError handler cannot be triggered by the download.
try:
    import urllib.request
    _urlopen = urllib.request.urlopen
except ImportError:
    # Fallback for interpreters without urllib.request (Python 2 layout).
    import urllib
    _urlopen = urllib.urlopen

# Column labels applied to the table, in file order.
data_names = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality (0-10)',
]

# Download the file and load it as a pandas DataFrame. The file is
# semicolon-separated; its own header row is skipped and replaced by
# data_names. The response is closed explicitly (try/finally rather than
# `with`, because the fallback opener's result is not a context manager).
raw_data = _urlopen(url)
try:
    data = pd.read_csv(raw_data, sep=";", header=None, skiprows=1, names=data_names)
finally:
    raw_data.close()
# Build the single-feature design matrix and the target as (n, 1) column
# vectors, which is the 2-D layout the estimator's fit() is given here.
x_data = data['alcohol'].values[:, np.newaxis]
y_data = data['quality (0-10)'].values[:, np.newaxis]

# Create the linear regression object and fit it in one step
# (fit() returns the estimator itself).
regr = linear_model.LinearRegression().fit(x_data, y_data)
# Create the figure directly. Calling plt.clf() first would implicitly create
# (and clear) a separate default figure, which plt.show() would then display
# as an extra blank window.
plt.figure(figsize=(10, 6))
plt.title('Alcohol vs Quality')
plt.xlabel(data_names[10])
plt.ylabel(data_names[11])

# Plot the data and the fit for the linear regression. Both artists carry a
# label so that the legend has entries to show.
plt.scatter(data['alcohol'].values, data['quality (0-10)'].values, label='data')
plt.plot(x_data, regr.predict(x_data), color='black', linewidth=3, label='linear fit')

# The legend is built after all artists exist; calling it earlier (or with no
# labelled artists) produces an empty legend and a warning.
plt.legend()
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment