Skip to content

Instantly share code, notes, and snippets.

@codecademydev
Created June 10, 2020 01:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save codecademydev/0524aabe1a129a445a3e75d8b8472257 to your computer and use it in GitHub Desktop.
Save codecademydev/0524aabe1a129a445a3e75d8b8472257 to your computer and use it in GitHub Desktop.
Codecademy export
import codecademylib3_seaborn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# load and investigate the data here:
df = pd.read_csv("tennis_stats.csv")

# First rows of the table.
preview = df.head()
print(preview)

# The "Player" column on its own.
players = df["Player"]
print(players)

# Every column label available for analysis.
column_labels = df.columns
print(column_labels)

# Summary statistics as produced by DataFrame.describe().
summary_stats = df.describe()
print(summary_stats)
# perform exploratory analysis here:
# Note: we only use columns that help us understand a player's own record,
# not their opponent's.
# Correlate numeric columns only. pandas >= 2.0 no longer drops non-numeric
# columns (presumably "Player" here) silently and raises instead;
# select_dtypes also works on older pandas releases.
print(df.select_dtypes(include="number").corr())
# From the correlation table, "Winnings" has the highest correlation in our data

# Each (x column, y column) pair is drawn as its own scatter plot, in order.
scatter_pairs = [
    ('BreakPointsOpportunities', 'Winnings'),
    ('FirstServeReturnPointsWon', 'Winnings'),
    ('TotalPointsWon', 'Ranking'),
    ('TotalServicePointsWon', 'Wins'),
]
for x_column, y_column in scatter_pairs:
    plt.scatter(df[x_column], df[y_column])
    plt.title(x_column + ' vs ' + y_column)
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.show()
    # Clear the figure so the next plot starts from an empty canvas.
    plt.clf()

# From the plots we can see that BreakPointsOpportunities correlates well with
# Winnings, and it is the best column that describes the player's own record
# rather than the opponent's.
## perform single feature linear regressions here:
# Single feature using 'FirstServeReturnPointsWon'
# Setting the feature and the value to predict
features = df[['FirstServeReturnPointsWon']]
outcome = df[['Winnings']]

# Split into train and test sets (80% of the rows are used for training).
# No random_state is passed, so the split, and therefore the score,
# changes from run to run.
features_train, features_test, winning_train, winning_test = train_test_split(
    features, outcome, train_size=0.8
)

# Train the model on the training data only
model = LinearRegression()
model.fit(features_train, winning_train)

# Score the model on the held-out test data. The label matches the printouts
# of the other models so the scores can be told apart in the output.
print("The FirstServeReturnPointsWon test score is: ", model.score(features_test, winning_test))
# In one recorded run the FirstServeReturnPointsWon test score was
# 0.07210859488135313

# Predict Winnings for the test rows using our model
winning_prediction = model.predict(features_test)

# Plot predicted winnings against actual winnings
plt.scatter(winning_test, winning_prediction, alpha=0.4)
plt.title('Predicted Winnings vs. Actual Winnings - 1 Feature')
plt.xlabel('Actual Winnings')
plt.ylabel('Predicted Winnings')
plt.show()
plt.clf()
# Single-feature model: predict 'Winnings' from 'BreakPointsOpportunities'.
break_point_chances = df[['BreakPointsOpportunities']]
winnings = df[['Winnings']]

# 80% of the rows go to training; the remainder is held out for testing.
x_train, x_test, y_train, y_test = train_test_split(
    break_point_chances,
    winnings,
    train_size=0.8,
)

# fit() returns the estimator itself, so creation and training can be chained.
model = LinearRegression().fit(x_train, y_train)

# Evaluate on the held-out rows.
test_score = model.score(x_test, y_test)
print("The BreakPointsOpportunities test score is: ", test_score)
"""
The BreakPointsOpportunities Test Score was 0.776373293125185
"""

# Compare the model's predictions with the actual winnings of the test rows.
predicted_winnings = model.predict(x_test)
plt.scatter(y_test, predicted_winnings, alpha=0.4)
plt.title('Predicted Winnings vs. Actual Winnings - 1 Feature')
plt.xlabel('Actual Winnings')
plt.ylabel('Predicted Winnings')
plt.show()
plt.clf()
## perform two feature linear regressions here:
# Two features: 'BreakPointsOpportunities' and 'FirstServeReturnPointsWon'
features = df[['BreakPointsOpportunities', 'FirstServeReturnPointsWon']]
outcome = df[['Winnings']]

# Split into train and test sets (80% of the rows are used for training).
# No random_state is passed, so the split, and therefore the score,
# changes from run to run.
features_train, features_test, winning_train, winning_test = train_test_split(
    features, outcome, train_size=0.8
)

# Train the model on the training data only
model = LinearRegression()
model.fit(features_train, winning_train)

# Score the model on the held-out test data
print("The BreakPointsOpportunities and FirstServeReturnPointsWon test score is: ", model.score(features_test, winning_test))
# In one recorded run the 'BreakPointsOpportunities' and
# 'FirstServeReturnPointsWon' test score was 0.7969221844119052

# Predict Winnings for the test rows using our model
winning_prediction = model.predict(features_test)

# Plot predicted winnings against actual winnings.
# The title names the actual feature count (the original said "1 Feature").
plt.scatter(winning_test, winning_prediction, alpha=0.4)
plt.title('Predicted Winnings vs. Actual Winnings - 2 Features')
plt.xlabel('Actual Winnings')
plt.ylabel('Predicted Winnings')
plt.show()
plt.clf()
## perform multiple feature linear regressions here:
# Columns used as predictors for the multiple-feature model.
feature_columns = [
    'FirstServe',
    'FirstServePointsWon',
    'FirstServeReturnPointsWon',
    'SecondServePointsWon',
    'SecondServeReturnPointsWon',
    'Aces',
    'BreakPointsConverted',
    'BreakPointsFaced',
    'BreakPointsOpportunities',
    'BreakPointsSaved',
    'DoubleFaults',
    'ReturnGamesPlayed',
    'ReturnGamesWon',
    'ReturnPointsWon',
    'ServiceGamesPlayed',
    'ServiceGamesWon',
    'TotalPointsWon',
    'TotalServicePointsWon',
]
features = df[feature_columns]
outcome = df[['Winnings']]

# Split into train and test sets (80% of the rows are used for training).
# No random_state is passed, so the split, and therefore the score,
# changes from run to run.
features_train, features_test, winning_train, winning_test = train_test_split(
    features, outcome, train_size=0.8
)

# Train the model on the training data only
model = LinearRegression()
model.fit(features_train, winning_train)

# Score the model on the held-out test data
print("The Multiple feature test score is: ", model.score(features_test, winning_test))
# In one recorded run the multiple-feature test score was 0.8273217996303851

# Predict Winnings for the test rows using our model
winning_prediction = model.predict(features_test)

# Plot predicted winnings against actual winnings.
# The title reflects that several features are used (the original said
# "1 Feature").
plt.scatter(winning_test, winning_prediction, alpha=0.4)
plt.title('Predicted Winnings vs. Actual Winnings - Multiple Features')
plt.xlabel('Actual Winnings')
plt.ylabel('Predicted Winnings')
plt.show()
plt.clf()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment