-
-
Save codecademydev/0524aabe1a129a445a3e75d8b8472257 to your computer and use it in GitHub Desktop.
Codecademy export
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Codecademy environment helper; kept first as in the original file
# (NOTE(review): presumably it must load before pyplot -- confirm).
import codecademylib3_seaborn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Load the tennis statistics, then print a quick overview of the data:
# the first rows, the "Player" column, the column labels and the summary
# statistics, in that order.
df = pd.read_csv("tennis_stats.csv")
overview = (df.head(), df["Player"], df.columns, df.describe())
for section in overview:
    print(section)
# perform exploratory analysis here:
# Note: we only use data columns that help us understand a player's own
# record, not their opponent's.

# Correlate the numeric columns only. From pandas 2.0 onwards DataFrame.corr()
# raises on non-numeric columns instead of silently dropping them
# (NOTE(review): presumably "Player" holds names -- confirm). select_dtypes
# is used because it also works on older pandas versions.
print(df.select_dtypes(include="number").corr())
# From the correlation table, "Winnings" has the highest correlation in our data

# (x column, y column) pairs to inspect visually; each pair gets its own
# scatter plot titled "<x> vs <y>".
scatter_pairs = [
    ('BreakPointsOpportunities', 'Winnings'),
    ('FirstServeReturnPointsWon', 'Winnings'),
    ('TotalPointsWon', 'Ranking'),
    ('TotalServicePointsWon', 'Wins'),
]
for x_column, y_column in scatter_pairs:
    plt.scatter(df[x_column], df[y_column])
    plt.title('{} vs {}'.format(x_column, y_column))
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.show()
    # Clear the figure so the next plot starts from an empty canvas.
    plt.clf()

# From our plots we can see that BreakPointsOpportunities correlates well with
# our Winnings data, and it is the best column that explains our player's
# record and not the opponent's.
## perform single feature linear regressions here:
# Single feature using 'FirstServeReturnPointsWon'
# Setting the features and values to predict
features = df[['FirstServeReturnPointsWon']]
outcome = df[['Winnings']]

# Split into train (80%) and test (20%) sets. No random_state is passed, so
# the split -- and therefore the score printed below -- changes on every run.
features_train, features_test, winning_train, winning_test = train_test_split(
    features, outcome, train_size=0.8)

# Train the model on the training data
model = LinearRegression()
model.fit(features_train, winning_train)

# Score the model on the held-out test data. The output is labelled so it can
# be told apart from the other scores this script prints.
print("The FirstServeReturnPointsWon test score is: ", model.score(features_test, winning_test))
# Score recorded from one earlier run: 0.07210859488135313

# Predict winnings for the test set using our model
winning_prediction = model.predict(features_test)

# Plot predicted winnings against actual winnings
plt.scatter(winning_test, winning_prediction, alpha=0.4)
plt.title('Predicted Winnings vs. Actual Winnings - 1 Feature')
plt.xlabel('Actual Winnings')
plt.ylabel('Predicted Winnings')
plt.show()
plt.clf()
# Single feature using 'BreakPointsOpportunities'
# Target first, then the single predictor column (kept 2-D for sklearn).
outcome = df[['Winnings']]
features = df[['BreakPointsOpportunities']]

# 80/20 train/test split of features and outcome
split = train_test_split(features, outcome, train_size=0.8)
features_train, features_test, winning_train, winning_test = split

# fit() returns the estimator itself, so construction and training are chained
model = LinearRegression().fit(features_train, winning_train)

# Report how well the model does on the held-out test data
test_score = model.score(features_test, winning_test)
print("The BreakPointsOpportunities test score is: ", test_score)
# Score recorded from one earlier run: 0.776373293125185

# Predicted vs. actual winnings on the test set
winning_prediction = model.predict(features_test)
plt.scatter(winning_test, winning_prediction, alpha=0.4)
plt.xlabel('Actual Winnings')
plt.ylabel('Predicted Winnings')
plt.title('Predicted Winnings vs. Actual Winnings - 1 Feature')
plt.show()
plt.clf()
## perform two feature linear regressions here:
# Two features: 'BreakPointsOpportunities' and 'FirstServeReturnPointsWon'
# Setting the features and values to predict
features = df[['BreakPointsOpportunities', 'FirstServeReturnPointsWon']]
outcome = df[['Winnings']]

# Split into train (80%) and test (20%) sets. No random_state is passed, so
# the split -- and therefore the score printed below -- changes on every run.
features_train, features_test, winning_train, winning_test = train_test_split(
    features, outcome, train_size=0.8)

# Train the model on the training data
model = LinearRegression()
model.fit(features_train, winning_train)

# Score the model on the held-out test data
print("The BreakPointsOpportunities and FirstServeReturnPointsWon test score is: ", model.score(features_test, winning_test))
# Score recorded from one earlier run: 0.7969221844119052

# Predict winnings for the test set using our model
winning_prediction = model.predict(features_test)

# Plot predicted winnings against actual winnings. The title names the
# two-feature model (it previously said "1 Feature", copied from the
# single-feature section).
plt.scatter(winning_test, winning_prediction, alpha=0.4)
plt.title('Predicted Winnings vs. Actual Winnings - 2 Features')
plt.xlabel('Actual Winnings')
plt.ylabel('Predicted Winnings')
plt.show()
plt.clf()
## perform multiple feature linear regressions here:
# Columns used as predictors for the multiple-feature model.
feature_columns = [
    'FirstServe',
    'FirstServePointsWon',
    'FirstServeReturnPointsWon',
    'SecondServePointsWon',
    'SecondServeReturnPointsWon',
    'Aces',
    'BreakPointsConverted',
    'BreakPointsFaced',
    'BreakPointsOpportunities',
    'BreakPointsSaved',
    'DoubleFaults',
    'ReturnGamesPlayed',
    'ReturnGamesWon',
    'ReturnPointsWon',
    'ServiceGamesPlayed',
    'ServiceGamesWon',
    'TotalPointsWon',
    'TotalServicePointsWon',
]
features = df[feature_columns]
outcome = df[['Winnings']]

# Split into train (80%) and test (20%) sets. No random_state is passed, so
# the split -- and therefore the score printed below -- changes on every run.
features_train, features_test, winning_train, winning_test = train_test_split(
    features, outcome, train_size=0.8)

# Train the model on the training data
model = LinearRegression()
model.fit(features_train, winning_train)

# Score the model on the held-out test data
print("The Multiple feature test score is: ", model.score(features_test, winning_test))
# Score recorded from one earlier run: 0.8273217996303851

# Predict winnings for the test set using our model
winning_prediction = model.predict(features_test)

# Plot predicted winnings against actual winnings. The title names the
# multiple-feature model (it previously said "1 Feature", copied from the
# single-feature section).
plt.scatter(winning_test, winning_prediction, alpha=0.4)
plt.title('Predicted Winnings vs. Actual Winnings - Multiple Features')
plt.xlabel('Actual Winnings')
plt.ylabel('Predicted Winnings')
plt.show()
plt.clf()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment