Skip to content

Instantly share code, notes, and snippets.

@cjimmy cjimmy/lavaman2015-pass.py
Last active Jul 13, 2017

Embed
What would you like to do?
Python code that sorts through the Lavaman Triathlon 2015 results, cleans up data, and creates a csv file to be parsed by D3 and shown in data visualizations of the race seen in http://jimmychion.com/lavaman2015.html
from datetime import timedelta
from enum import IntEnum
import csv
# Name the columns of a result row so the rest of the script can index by
# meaning instead of by magic number.
class Col(IntEnum):
    """Index of each whitespace-separated field in one athlete's result row."""

    # overall placing and identity
    place = 0
    divTot = 1
    bib = 2
    cat = 3
    firstName = 4
    lastName = 5
    age = 6
    sex = 7
    div = 8

    # swim and first transition
    swimRank = 9
    swimTime = 10
    swimPace = 11
    t1Time = 12

    # bike and second transition
    bikeRank = 13
    bikeTime = 14
    bikeSpeed = 15
    t2Rank = 16
    t2Time = 17

    # run and finish
    runRank = 18
    runTime = 19
    runPace = 20
    totalTime = 21
class StartTimes(IntEnum):
    """Start offset of each wave; values match the minute offsets in getStartTime.

    NOTE(review): nothing in the visible script reads this enum, and the member
    names do not line up exactly with the wave lists (for example M4044 is
    listed in wave2, while wave3 begins at M4549) - confirm before relying on it.
    """

    proElite = 0
    M044 = 5
    M4454 = 10
    M55 = 15
    F039 = 20
    F4049 = 25
    F50 = 30
    # Novice = 35
# Age-group divisions that make up each start wave. Wave 1 has no list: the
# script recognizes it by cat == 'P' rather than by division.
wave2 = 'M119 M2024 M2529 M3034 M3539 M4044'.split()
wave3 = 'M4549 M5054'.split()
wave4 = 'M5559 M6064 M6569 M7074 M75PL'.split()
wave5 = 'F119 F2024 F2529 F3034 F3539'.split()
wave6 = 'F4044 F4549'.split()
wave7 = 'F5054 F5559 F6064 F6569 F7074 F75PL'.split()
# Import the Lavaman results (the individual results copied into a separate
# file): one athlete per line, split on whitespace into a list of string fields.
with open("results.txt") as f:
    r = [line.strip().split() for line in f]
# Remove dirty data, i.e. results where times are clumped together so the row
# splits into fewer fields than a complete result has.
# Not ideal, sorry to those 51 athletes whose ankle bracelets didn't get picked up.
# TODO "clean" the data instead by estimating their rank from the average time
# of the 10 people ranked around them.
# NOTE(review): this length check runs before multi-word last names are merged,
# so extra name tokens count towards the minimum - confirm that is acceptable.
_MIN_FIELDS = 22
r2 = [row for row in r if len(row) >= _MIN_FIELDS]
r = r2
# Combine multi-word last names so the column indices line up with Col.
# Splitting on whitespace spreads a last name such as "de la Cruz" over several
# fields, which pushes every later column to the right.
for line in r:
    # The age is the first all-digit token at or after its expected column;
    # everything between the last-name column and that token is more last name.
    # Raises IndexError if the row has no all-digit token from there on.
    ageAt = Col.age
    while not line[ageAt].isdigit():
        ageAt += 1
    if ageAt > Col.age:
        line[Col.lastName] = ' '.join(line[Col.lastName:ageAt])
        # Delete by position. list.remove() deletes the first element that is
        # *equal* to the value, which can be an earlier column (for example a
        # first name identical to one of the last-name tokens).
        del line[Col.lastName + 1:ageAt]
# Convert every time-like field ("mm:ss" or "h:mm:ss") into a timedelta.
for line in r:
    for idx, cell in enumerate(line):
        if ':' not in cell:
            continue  # not a time
        parts = cell.split(':')
        if len(parts) == 2:  # "12:34" has no hours component
            cell = timedelta(minutes=int(parts[0]), seconds=int(parts[1]))
        elif len(parts) == 3:
            cell = timedelta(hours=int(parts[0]), minutes=int(parts[1]), seconds=int(parts[2]))
        # Any other number of parts is written back unchanged as a string.
        line[idx] = cell
#-- Adding together the split times and comparing the sum to the finish time, it looks like
#-- the finish time is determined independently of the split times (that's good), and the
#-- split times are rounded up from the milliseconds. Code to show that:
# for line in r:
#     print((line[Col.swimTime] + line[Col.t1Time] + line[Col.bikeTime] + line[Col.t2Time] + line[Col.runTime]) - line[Col.totalTime])
# Append the wave number (as a string, '1' to '7') to each athlete row.
# Pros are wave 1 and are identified by category; everyone else by division.
for athlete in r:
    if athlete[Col.cat] == 'P':
        athlete.append('1')
        continue
    division = athlete[Col.div]
    for waveLabel, divisions in (('2', wave2), ('3', wave3), ('4', wave4),
                                 ('5', wave5), ('6', wave6), ('7', wave7)):
        if division in divisions:
            athlete.append(waveLabel)
            break
    else:
        # No wave matched: nothing is appended for this athlete.
        print("not a recognized division")
# wave start offset
def getStartTime(athlete):
    """Return the athlete's wave start offset as a timedelta.

    Pros (cat == 'P') start at zero; each later wave starts five minutes after
    the previous one. For a division that is in none of the wave lists, a
    message is printed and None is returned.
    """
    if athlete[Col.cat] == 'P':
        return timedelta(0)

    division = athlete[Col.div]
    schedule = (
        (wave2, 5),
        (wave3, 10),
        (wave4, 15),
        (wave5, 20),
        (wave6, 25),
        (wave7, 30),
    )
    for divisions, minutes in schedule:
        if division in divisions:
            return timedelta(minutes=minutes)

    print("not a recognized division")
    return None
#-- find the index of a bib number in a leg list (i.e. rank - 1)
def findIndexOfBibInLeg(targetBib, leg):
    """Return the index of the first row of `leg` holding the bib, or None.

    The bib is compared as a string against every cell of each row, so a row
    matches if any of its cells equals str(targetBib).
    """
    wanted = str(targetBib)
    for position, row in enumerate(leg):
        if any(cell == wanted for cell in row):
            return position
    return None
# Calculate the aggregated (cumulative) times after each leg.
def _cumulativeSplits(athlete, offset):
    """Return [bib, after swim, after T1, after bike, after T2, after run].

    Each entry after the bib is `offset` plus the sum of the athlete's leg
    times up to and including that leg. Raises TypeError if `offset` is None.
    """
    row = [athlete[Col.bib]]
    elapsed = offset
    for leg in (Col.swimTime, Col.t1Time, Col.bikeTime, Col.t2Time, Col.runTime):
        elapsed = elapsed + athlete[leg]
        row.append(elapsed)
    return row


# Overall absolute times, accounting for wave start times.
timesByOverall = [_cumulativeSplits(athlete, getStartTime(athlete)) for athlete in r]

# Each athlete's aggregate times without accounting for waves, i.e. the times
# after each leg as if everybody had started at the same moment.
timesByOverallStartTogether = [_cumulativeSplits(athlete, timedelta(0)) for athlete in r]
# Sort keys for the per-leg orderings. Each one picks a single column out of a
# row shaped [bib, after swim, after T1, after bike, after T2, after run].
def getKeySwim(row):
    """Return the cumulative time after the swim (column 1)."""
    return row[1]


def getKeyT1(row):
    """Return the cumulative time after T1 (column 2)."""
    return row[2]


def getKeyBike(row):
    """Return the cumulative time after the bike (column 3)."""
    return row[3]


def getKeyT2(row):
    """Return the cumulative time after T2 (column 4)."""
    return row[4]


def getKeyRun(row):
    """Return the cumulative time after the run (column 5)."""
    return row[5]
# One ordering of the athletes per leg: position in a list is the overall rank
# (minus one) at the end of that leg.
_legKeys = (getKeySwim, getKeyT1, getKeyBike, getKeyT2, getKeyRun)

# ... with wave start offsets included
(timesBySwim,
 timesByT1,
 timesByBike,
 timesByT2,
 timesByRun) = [sorted(timesByOverall, key=legKey) for legKey in _legKeys]

# ... as if everyone had started together
(timesBySwimStartTogether,
 timesByT1StartTogether,
 timesByBikeStartTogether,
 timesByT2StartTogether,
 timesByRunStartTogether) = [sorted(timesByOverallStartTogether, key=legKey) for legKey in _legKeys]
#-- number of people passed in each leg, given wave starts
def numPplPassed(bib):
    """Return how many people the athlete passed in T1, bike, T2 and run.

    Each entry is the athlete's position before the leg minus the position
    after it, using the wave-start orderings. It looks inverted, but position 0
    is the best rank: positive means that many people were passed, negative
    means that many people passed the athlete.
    """
    orderings = (timesBySwim, timesByT1, timesByBike, timesByT2, timesByRun)
    positions = [findIndexOfBibInLeg(bib, ordering) for ordering in orderings]
    return [before - after for before, after in zip(positions, positions[1:])]
#-- number of people passed in each leg if everyone had started together
def numPplPassedStartTogether(bib):
    """Return how many people the athlete passed in T1, bike, T2 and run.

    Same calculation as the wave-start version, but using the orderings that
    ignore wave offsets: position before the leg minus position after it.
    """
    orderings = (
        timesBySwimStartTogether,
        timesByT1StartTogether,
        timesByBikeStartTogether,
        timesByT2StartTogether,
        timesByRunStartTogether,
    )
    positions = [findIndexOfBibInLeg(bib, ordering) for ordering in orderings]
    return [before - after for before, after in zip(positions, positions[1:])]
#-- returns aggregate number of people you passed SO FAR after each leg T-1, Bike, T-2, and the run
# def aggregatePplPassed(bib):
# nPassed = []
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByT1))
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByBike))
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByT2))
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByRun))
# return nPassed
#-- net number of people passed during the swim, across waves
def numPplPassedInSwim(bib):
    """Return (people you passed) - (people who passed you) in the swim.

    Only athletes from other waves are counted:
      * you passed someone who started in an earlier wave if they left the
        water after you did;
      * someone passed you if they started in a later wave and still left the
        water before you did.
    "Left the water" is the absolute time: wave start offset plus swim time,
    for both athletes. Waves are told apart by the offset getStartTime returns,
    which is distinct per wave and grows with the wave number.

    Returns 0 when the athlete's own division is not recognized. Athletes whose
    division is not recognized are left out of the comparison.

    Raises:
        ValueError: if no athlete in `r` has the given bib.
    """
    wantedBib = str(bib)
    target = next((row for row in r if row[Col.bib] == wantedBib), None)
    if target is None:
        raise ValueError("no athlete with bib %s" % wantedBib)

    targetStart = getStartTime(target)
    if targetStart is None:
        return 0
    targetExit = targetStart + target[Col.swimTime]

    youPassed = 0
    passedYou = 0
    for other in r:
        otherStart = getStartTime(other)
        if otherStart is None or otherStart == targetStart:
            continue  # unknown wave, or same wave as the target
        # Include the other athlete's own wave offset: an earlier starter has
        # only been passed if they are still in the water when you get out.
        otherExit = otherStart + other[Col.swimTime]
        if otherStart < targetStart and otherExit > targetExit:
            youPassed += 1
        elif otherStart > targetStart and otherExit < targetExit:
            passedYou += 1
    return youPassed - passedYou
#---------------------------------------------------
# Append the calculated numbers onto each athlete row so they get written to the file.
for athlete in r:
    bibNumber = athlete[Col.bib]
    # special case: the swim only compares against athletes in other waves
    athlete.append(numPplPassedInSwim(bibNumber))
    # people passed in each later leg, with wave starts ...
    athlete.extend(numPplPassed(bibNumber))
    # ... and the same while ignoring wave starts
    athlete.extend(numPplPassedStartTogether(bibNumber))
    #-- and append the estimated rank for T1 to show it in the percentiles
    athlete.append(findIndexOfBibInLeg(bibNumber, timesByT1) + 1)  # rank = index + 1
# Convert timedeltas to whole seconds so D3 can use them more easily.
# Rows are rewritten in place; every other cell is kept as it is.
for line in r:
    line[:] = [
        int(cell.total_seconds()) if isinstance(cell, timedelta) else cell
        for cell in line
    ]
# When this script is run, results.csv is overwritten with the calculated numbers.
header = [
    # columns taken from the race results
    'place', 'divTot', 'bib', 'cat', 'firstName', 'lastName', 'age', 'sex', 'div',
    'swimRank', 'swimTime', 'swimPace', 't1Time',
    'bikeRank', 'bikeTime', 'bikeSpeed', 't2Rank', 't2Time',
    'runRank', 'runTime', 'runPace', 'totalTime',
    # columns appended by this script
    'wave', 'wavePassedSwim',
    'wavePassedT1', 'wavePassedBike', 'wavePassedT2', 'wavePassedRun',
    'startPassedT1', 'startPassedBike', 'startPassedT2', 'startPassedRun',
    'estimatedT1Rank',
]
with open('results.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    writer.writerows(r)
#---------------------------------------------------
#-- number of people "passed" is aggregate rank after leg_1 - aggregate rank after leg_2
#-- if negative, people passed you
# print('In the swim, you passed %s people in the waves ahead of you' % numPplPassedInSwim(122))
# print(numPplPassed(122))
# print(aggregatePplPassed(122))
# passed = []
# for i in range(0, 1020):
# try:
# passed.append(findIndexOfBibInLeg(i, timesBySwim) - findIndexOfBibInLeg(i, timesByRun))
# except (RuntimeError, TypeError, NameError):
# pass
# print(sorted(passed)) #-- to see the distribution of overall number of people passed
#convert to dictionary (key = bib num, value = list of aggregate times)
# algorithm
# given bib number, output number and list of people who you passed or passed you in a given leg
#----------- if everyone started at the same time
# compute aggregated time for each leg
# sort based on aggregated time (i.e. get overall rank at each leg finish)
# passed those with higher rank in previous leg
#----------- given wave start
# first wave starts at 00:00:00
# second wave starts at 00:05:00
# third wave starts at 00:10:00, etc
# rank is not given until after the athlete has completed the swim
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.