Skip to content

Instantly share code, notes, and snippets.

@barnden
Created August 10, 2021 02:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save barnden/31ebac6b9641b6a0e4589565b20921e6 to your computer and use it in GitHub Desktop.
Save barnden/31ebac6b9641b6a0e4589565b20921e6 to your computer and use it in GitHub Desktop.
def distance(pid, yr):
    """Print the player season most similar to the given one.

    Selects the season of player ``pid`` in year ``yr`` from the module-level
    ``orig_data`` frame (restricted to the columns in ``cols``), computes the
    Mahalanobis distance from it to every row of the module-level ``data``
    frame using ``invcov``, and prints the row of ``batting_data`` whose
    distance is smallest. The selected season itself is excluded from the
    comparison.

    Args:
        pid: Value matched against ``orig_data.playerID``.
        yr: Value matched against ``orig_data.yearID``.

    Returns:
        None. Both the compared season and the closest match are printed.

    Raises:
        IndexError: If no row of ``orig_data`` matches ``pid`` and ``yr``.
    """
    # Get player data. If several rows match, only the first one's index
    # label (and, below, only the first distance) is used.
    selected = (orig_data.playerID == pid) & (orig_data.yearID == yr)
    player = orig_data[selected][cols]
    sid = player.index.astype(int)[0]
    print('Comparing: {} (id: {})'.format(pid, sid))

    # Mask invalid values (NaN / inf) in the player vector.
    pvec = np.ma.masked_invalid(np.array(player))

    min_player = None
    min_val = None

    # Iterate by position: the same position is used to fetch the matching
    # row from batting_data.
    # NOTE(review): assumes data and batting_data are row-aligned and that
    # invcov is the inverse covariance matrix of data's columns -- confirm.
    for i in range(len(data)):
        # Get the ith player season.
        cdata = data.iloc[i]

        # Ignore the season we are comparing against.
        if cdata.name == sid:
            continue

        # Mask invalid values.
        cvec = np.ma.masked_invalid(np.array(cdata))

        # Difference between the two season vectors; shape (rows, n_cols).
        delta = pvec - cvec

        # Mahalanobis distance: sqrt(delta . invcov . delta^T) per row.
        # NOTE(review): np.einsum may not honor the mask of a masked array;
        # confirm that seasons with invalid values behave as intended.
        dist = np.sqrt(np.einsum('nj,jk,nk->n', delta, invcov, delta))[0]

        # Keep the smallest distance seen so far. The original tested an
        # undefined name (min_id) here, which raised NameError on the first
        # compared row.
        # NOTE(review): if the first stored distance is NaN it is never
        # replaced, because every comparison against NaN is False.
        if min_val is None or min_val > dist:
            min_player = batting_data.iloc[i]
            min_val = dist

    # Print out the most similar season.
    print('Most similar: dist: {}\n{}'.format(min_val, min_player))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment