Skip to content

Instantly share code, notes, and snippets.

@howardhamilton
Last active November 9, 2017 17:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save howardhamilton/87590b2e824d76d01f6d to your computer and use it in GitHub Desktop.
Save howardhamilton/87590b2e824d76d01f6d to your computer and use it in GitHub Desktop.
Identify N nearest neighbors to player's season statistics
"""
An outline for identifying N players with similar summary statistics to a player of interest.
A list of candidate players is compiled by filtering on position and end-of-season age.
Summary statistics are scaled to z-scores, which are the inputs to the machine learning model.
This algorithm uses K-Nearest Neighbors, but other algorithms (e.g., K-Means Clustering) can be substituted.
(c) 2015 Soccermetrics Research LLC
This code is licensed under the terms of the MIT License (http://choosealicense.com/licenses/mit/)
"""
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import cosine
import numpy as np
def get_statistical_fields(position):
    """
    Compile list of statistical categories to be retrieved from database
    given player position.

    :param position: Player position, compared against ``POSITION['Goalkeeper']``
        (``POSITION`` is expected to be defined at module level).
    :returns: Tuple ``(stat_table, metrics)`` where ``stat_table`` is the name of
        the statistics table for the position and ``metrics`` is a tuple of
        metric names (common metrics followed by position-specific ones).
    """
    # metrics consistent among all players
    all_metric_list = ('games_started', 'games_subbed', 'minutes', 'yellows', 'reds')
    # metrics for field players
    field_metric_list = ('goals_total', 'goals_headed', 'goals_freekick', 'goals_in_area',
                         'goals_out_area', 'goals_penalty', 'penalties', 'winners', 'assists',
                         'deadball_assists', 'shots', 'fouls')
    # metrics for goalkeepers
    gk_metric_list = ('wins', 'draws', 'losses', 'goals_allowed', 'shutouts', 'shots_allowed')

    if position != POSITION['Goalkeeper']:
        stat_table = 'field_stats_list'
        metrics = all_metric_list + field_metric_list
    else:
        stat_table = 'goalkeeper_stats_list'
        metrics = all_metric_list + gk_metric_list

    # The caller unpacks two values, so both must be returned explicitly;
    # without this the function implicitly returned None.
    return stat_table, metrics
def identify_similar_player(player, competition, season, N=20):
    """
    Identify N nearest neighbors to player's statistical performance
    controlling for players of same age and position.

    :param player: Identifier of the player of interest.
    :param competition: Identifier of the competition of the player's record.
    :param season: Identifier of the season of the player's record.
    :param N: Maximum number of neighbors to return (default 20). If fewer
        candidates are available, all candidates are returned.
    :returns: Tuple ``(distances, indices)`` as returned by
        ``NearestNeighbors.kneighbors`` for a single query row; ``indices``
        refer to positions in the candidate list built inside this function.
    :raises ValueError: If no candidate players are found.
    """
    # retrieve player position and age
    player_position, player_age = get_position_and_age(player, season)

    # get statistical fields that are relevant to player position
    # field players and goalkeepers have statistical categories unique to them
    stat_table, stat_metrics = get_statistical_fields(player_position)

    # retrieve player z-scores for each metric --> player statistical record
    # some metrics do not exist for player --> these are zeroed out
    player_record = get_statistical_record(player, competition, season, stat_metrics)

    # retrieve candidate list of players who match position and age
    # player, competition, season IDs
    candidate_player_list = create_candidate_list(stat_table, player_position, player_age)

    # create candidate player statistical record --> feature matrix
    training_list = [
        get_statistical_record(candidate_player, candidate_competition,
                               candidate_season, stat_metrics)
        for candidate_player, candidate_competition, candidate_season in candidate_player_list
    ]
    if not training_list:
        raise ValueError("No candidate players found for the given position and age")

    X = np.array(training_list)
    # kneighbors expects a 2-D query of shape (n_queries, n_features)
    query = np.asarray(player_record).reshape(1, -1)
    # cannot request more neighbors than there are fitted samples
    n_neighbors = min(N, len(training_list))

    # train a K-Nearest Neighbor model on candidate player statistical records
    # BallTree algorithm with scipy's cosine distance passed as a callable metric
    # (the legacy metric='pyfunc', func=... form is not accepted by current scikit-learn)
    # NOTE(review): cosine distance does not satisfy the triangle inequality, so
    # tree-based search may not be exact -- consider algorithm='brute' with
    # metric='cosine' if exact neighbors are required.
    # for more details see http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree', metric=cosine).fit(X)

    # find the K-neighbors to the player's statistical record
    distances, indices = nbrs.kneighbors(query)
    return distances, indices
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment