# haydenflinner/stats.py

## stats.py
import pandas as pd

from dataclasses import dataclass

@dataclass
class PositionSample:
    """A single (lap, rider, position) observation from a race."""

    lap: int  # lap number; the sample data in this file counts from 1
    rider: str  # rider identifier
    pos: int  # position on that lap; the sample data uses 1 for the first rider


# Short alias so tables of samples stay compact.
Sample = PositionSample

# 5 riders, 4 lap race. Real data should reveal more noise / trends.
# Each string below is one lap's running order, position 1 first; the
# comprehension expands it into one Sample per (lap, rider) with positions
# counted from 1.
df = pd.DataFrame([
    Sample(lap, rider, pos)
    for lap, order in enumerate(
        (
            "abcde",
            "abced",  # <-- d and e traded places
            "adecb",  # b took out c and himself. or c took out b and himself.
            "adecb",  # Finishing lap unchanged from prior lap.
        ),
        start=1,
    )
    for pos, rider in enumerate(order, start=1)
])

# Our goal here is to find if there's a correlation between being near certain riders
# and changes in Position. For example, maybe a certain rider is known to ride
# a wide bike and so being just behind him means you have a below average
# chance of passing. To really weigh that you'd need to correct for speed with
# something like ELO or maybe just finishing position in the current race.
# Another example would be that having Jett Lawrence behind you is a recipe
# for losing one spot by the end of the lap.
# This may also reveal riders who have a tendency
# to put other riders far down the results sheet.

# Neighbours within a lap: order every lap by position, then the row above
# is the rider directly ahead and the row below is the rider directly behind.
# The first / last row of each lap has no such neighbour and gets NaN.
df = df.sort_values(by=["lap", "pos"]).assign(
    rider_ahead=lambda d: d.groupby("lap")["rider"].shift(),
    rider_behind=lambda d: d.groupby("lap")["rider"].shift(-1),
)

# History per rider: order each rider's rows by lap, then the row above is
# that rider's previous lap.
df = df.sort_values(by=["rider", "lap"]).assign(
    prev_pos=lambda d: d.groupby("rider")["pos"].shift(),
    rider_ahead_last_lap=lambda d: d.groupby("rider")["rider_ahead"].shift(),
)

# display(df.sort_values(by=["lap", "pos"]))
# Rows without a previous lap cannot yield a position change, so drop them.
# Not a big loss since first laps are especially hectic.
df = df.dropna(subset=["prev_pos"])

# pos - prev_pos: a positive value means a numerically larger position than
# on the previous lap.
df["pos_change"] = df["pos"] - df["prev_pos"]

# Renumber the rows 0..n-1 after the sorts and the drop.
df = df.reset_index(drop=True)


# `display` exists as a builtin only inside IPython/Jupyter sessions; fall
# back to print so the file also runs as a plain script instead of raising
# NameError.
try:
    _show = display
except NameError:
    _show = print

# sort_values returns a new frame, so its result has to be passed on rather
# than discarded; `df` itself keeps its current row order.
_show(df.sort_values(by=["lap", "pos"]))

# Largest pos_change observed for each rider_ahead_last_lap value.
_show(df.groupby("rider_ahead_last_lap")["pos_change"].max())

import plotly.express as px

# Box plot of pos_change for each rider_ahead_last_lap value. A bare
# px.box(...) expression is only rendered when it is the last expression of
# a notebook cell, so show the figure explicitly.
fig = px.box(df, x='rider_ahead_last_lap', y='pos_change')
fig.show()
	import pandas as pd

	from dataclasses import dataclass

	@dataclass
	class PositionSample:
	    """A single (lap, rider, position) observation from a race."""

	    # The fields must be indented under the class line; at the same indent
	    # as `class` the class has no body and the code does not parse.
	    lap: int  # lap number; the sample data in this file counts from 1
	    rider: str  # rider identifier
	    pos: int  # position on that lap; the sample data uses 1 for the first rider


	# Short alias so tables of samples stay compact.
	Sample = PositionSample

	# 5 riders, 4 lap race. Real data should reveal more noise / trends.
	# Each string below is one lap's running order, position 1 first; the
	# comprehension expands it into one Sample per (lap, rider) with positions
	# counted from 1.
	df = pd.DataFrame([
	    Sample(lap, rider, pos)
	    for lap, order in enumerate(
	        (
	            "abcde",
	            "abced",  # <-- d and e traded places
	            "adecb",  # b took out c and himself. or c took out b and himself.
	            "adecb",  # Finishing lap unchanged from prior lap.
	        ),
	        start=1,
	    )
	    for pos, rider in enumerate(order, start=1)
	])

	# Our goal here is to find if there's a correlation between being near certain riders
	# and changes in Position. For example, maybe a certain rider is known to ride
	# a wide bike and so being just behind him means you have a below average
	# chance of passing. To really weigh that you'd need to correct for speed with
	# something like ELO or maybe just finishing position in the current race.
	# Another example would be that having Jett Lawrence behind you is a recipe
	# for losing one spot by the end of the lap.
	# This may also reveal riders who have a tendency
	# to put other riders far down the results sheet.

	# Neighbours within a lap: order every lap by position, then the row above
	# is the rider directly ahead and the row below is the rider directly behind.
	# The first / last row of each lap has no such neighbour and gets NaN.
	df = df.sort_values(by=["lap", "pos"]).assign(
	    rider_ahead=lambda d: d.groupby("lap")["rider"].shift(),
	    rider_behind=lambda d: d.groupby("lap")["rider"].shift(-1),
	)

	# History per rider: order each rider's rows by lap, then the row above is
	# that rider's previous lap.
	df = df.sort_values(by=["rider", "lap"]).assign(
	    prev_pos=lambda d: d.groupby("rider")["pos"].shift(),
	    rider_ahead_last_lap=lambda d: d.groupby("rider")["rider_ahead"].shift(),
	)

	# display(df.sort_values(by=["lap", "pos"]))
	# Rows without a previous lap cannot yield a position change, so drop them.
	# Not a big loss since first laps are especially hectic.
	df = df.dropna(subset=["prev_pos"])

	# pos - prev_pos: a positive value means a numerically larger position than
	# on the previous lap.
	df["pos_change"] = df["pos"] - df["prev_pos"]

	# Renumber the rows 0..n-1 after the sorts and the drop.
	df = df.reset_index(drop=True)


	# `display` exists as a builtin only inside IPython/Jupyter sessions; fall
	# back to print so the file also runs as a plain script instead of raising
	# NameError.
	try:
	    _show = display
	except NameError:
	    _show = print

	# sort_values returns a new frame, so its result has to be passed on rather
	# than discarded; `df` itself keeps its current row order.
	_show(df.sort_values(by=["lap", "pos"]))

	# Largest pos_change observed for each rider_ahead_last_lap value.
	_show(df.groupby("rider_ahead_last_lap")["pos_change"].max())

	import plotly.express as px

	# Box plot of pos_change for each rider_ahead_last_lap value. A bare
	# px.box(...) expression is only rendered when it is the last expression of
	# a notebook cell, so show the figure explicitly.
	fig = px.box(df, x='rider_ahead_last_lap', y='pos_change')
	fig.show()