Last active
July 13, 2017 11:12
-
-
Save cjimmy/5ad1dd511a417ba55013 to your computer and use it in GitHub Desktop.
Python code that sorts through the Lavaman Triathlon 2015 results, cleans up data, and creates a csv file to be parsed by D3 and shown in data visualizations of the race seen in http://jimmychion.com/lavaman2015.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
from datetime import timedelta
from enum import IntEnum
from itertools import accumulate
# Enumerate the columns for readability.
class Col(IntEnum):
    """Index of each field in a parsed results row.

    A row is the list of whitespace-separated fields from one line of
    results.txt; the member names match the first 22 names of the CSV
    header written at the end of the script.
    """

    place = 0
    divTot = 1
    bib = 2
    cat = 3
    firstName = 4
    lastName = 5
    age = 6
    sex = 7
    div = 8
    swimRank = 9
    swimTime = 10
    swimPace = 11
    t1Time = 12
    bikeRank = 13
    bikeTime = 14
    bikeSpeed = 15
    t2Rank = 16
    t2Time = 17
    runRank = 18
    runTime = 19
    runPace = 20
    totalTime = 21
class StartTimes(IntEnum):
    """Wave start offsets, in minutes after the first (pro/elite) wave.

    The values mirror the minute offsets hard-coded in getStartTime().
    NOTE(review): nothing in the visible script references this enum.
    """

    proElite = 0
    M044 = 5
    M4454 = 10
    M55 = 15
    F039 = 20
    F4049 = 25
    F50 = 30
    # Novice = 35
# Division codes (the Col.div field) that start in each age-group wave.
# Wave 1 has no list: pros are recognised by Col.cat == 'P' instead.
wave2 = ['M119', 'M2024', 'M2529', 'M3034', 'M3539', 'M4044']
wave3 = ['M4549', 'M5054']
wave4 = ['M5559', 'M6064', 'M6569', 'M7074', 'M75PL']
wave5 = ['F119', 'F2024', 'F2529', 'F3034', 'F3539']
wave6 = ['F4044', 'F4549']
wave7 = ['F5054', 'F5559', 'F6064', 'F6569', 'F7074', 'F75PL']
# Import the Lavaman results (the individual results copied into a separate file).
# Each entry of r is one line of results.txt split on whitespace.
with open("results.txt") as f:
    r = [line.strip().split() for line in f]
# Remove dirty data (i.e. results where times are clumped together, which
# leaves the row with fewer than the 22 expected fields).
# Not ideal, sorry to those 51 athletes whose ankle bracelets didn't pick up.
# TODO "clean" data by getting a best estimate of their rank by giving them
# an average time of the 10 people ranked around them.
r2 = [line for line in r if len(line) >= 22]
r = r2
# Combine multi-word last names into a single field so the column indices
# line up with Col again.
for line in r:
    if line[Col.age].isdigit():
        continue  # the age is where it should be; nothing to merge

    # The first all-digit field at or after the age column is the real age;
    # everything between the last-name column and it belongs to the last name.
    ageAt = next(
        (k for k in range(Col.age, len(line)) if line[k].isdigit()),
        None,
    )
    if ageAt is None:
        raise ValueError("no age field found in row: %r" % (line,))

    # Replace the name pieces by position in a single slice assignment.
    # list.remove() would delete the first *equal value*, which can be the
    # first or last name itself when a name token repeats.
    line[Col.lastName:ageAt] = [' '.join(line[Col.lastName:ageAt])]
# Convert times to python timedelta objects.
for line in r:
    for i, datum in enumerate(line):
        if ':' not in datum:
            continue  # not a time
        parts = datum.split(':')
        if len(parts) == 2:  # "12:34", which doesn't have hours
            line[i] = timedelta(minutes=int(parts[0]), seconds=int(parts[1]))
        elif len(parts) == 3:  # "1:23:45"
            line[i] = timedelta(
                hours=int(parts[0]),
                minutes=int(parts[1]),
                seconds=int(parts[2]),
            )
        # Any other number of parts is left as the original string.
#-- adding together the split times and comparing it to the finish time, it looks like the finish time is determined independent of the split times (that's good) | |
#-- and the split times are rounded up from the milliseconds. Code to show that: | |
# for line in r: | |
# print (line[swimTime] + line[t1Time] + line[bikeTime] + line[t2Time] + line[runTime]) - line[totalTime] | |
# Tag every athlete with the number of the wave they started in (as a string).
# Pros (cat 'P') are wave 1 whatever their division says.
_waveByDivision = {}
for _number, _divisions in enumerate(
        (wave2, wave3, wave4, wave5, wave6, wave7), start=2):
    for _division in _divisions:
        _waveByDivision[_division] = str(_number)

for athlete in r:
    if athlete[Col.cat] == 'P':
        athlete.append('1')
    elif athlete[Col.div] in _waveByDivision:
        athlete.append(_waveByDivision[athlete[Col.div]])
    else:
        # NOTE(review): nothing is appended here, so this row ends up one
        # column short of the CSV header.
        print("not a recognized division")
# wave start offset
def getStartTime(athlete):
    """Return how long after the first wave this athlete started.

    Pros (cat 'P') start at zero; each later wave starts five minutes after
    the previous one (the same minute values as the StartTimes enum).

    Args:
        athlete: a results row indexed by Col.

    Returns:
        A timedelta, or None (after printing a message) when the athlete is
        not a pro and their division is in none of the wave lists.
    """
    if athlete[Col.cat] == 'P':
        return timedelta(0)

    division = athlete[Col.div]
    minutesByWave = (
        (wave2, 5),
        (wave3, 10),
        (wave4, 15),
        (wave5, 20),
        (wave6, 25),
        (wave7, 30),
    )
    for divisions, minutes in minutesByWave:
        if division in divisions:
            return timedelta(minutes=minutes)

    print("not a recognized division")
    return None
#-- find index of bib numbers in lists (i.e. rank-1) and find difference
def findIndexOfBibInLeg(targetBib, leg):
    """Return the position of the first row in `leg` containing the bib.

    Args:
        targetBib: bib number; converted with str() before comparing.
        leg: list of rows. A row matches when any of its cells equals the
            bib string.

    Returns:
        The zero-based index of the first matching row (i.e. rank - 1 when
        `leg` is sorted by time), or None when no row matches.
    """
    target = str(targetBib)
    for i, row in enumerate(leg):
        if target in row:
            return i
    return None
# Calculate the aggregated times.
# Overall absolute times, accounting for wave start times.
# Each row is [bib, after swim, after T1, after bike, after T2, after run],
# measured from the start of the first wave.
timesByOverall = []
for athlete in r:
    startTime = getStartTime(athlete)
    legs = [
        startTime + athlete[Col.swimTime],  # fold the wave offset into leg 1
        athlete[Col.t1Time],
        athlete[Col.bikeTime],
        athlete[Col.t2Time],
        athlete[Col.runTime],
    ]
    # accumulate() yields the running totals: legs[0], legs[0]+legs[1], ...
    timesByOverall.append([athlete[Col.bib]] + list(accumulate(legs)))
# Create timesByOverallStartTogether, a list of each athlete's aggregate
# times without accounting for waves, i.e. the individual aggregate times
# after each leg if everybody had started at the same time.
# Each row is [bib, after swim, after T1, after bike, after T2, after run].
timesByOverallStartTogether = []
for athlete in r:
    legs = [
        athlete[Col.swimTime],
        athlete[Col.t1Time],
        athlete[Col.bikeTime],
        athlete[Col.t2Time],
        athlete[Col.runTime],
    ]
    # accumulate() yields the running totals: legs[0], legs[0]+legs[1], ...
    timesByOverallStartTogether.append(
        [athlete[Col.bib]] + list(accumulate(legs)))
# Sort keys for the aggregate-time rows, which are laid out as
# [bib, after swim, after T1, after bike, after T2, after run].
def getKeySwim(row):
    """Return the cumulative time at the end of the swim (row[1])."""
    return row[1]


def getKeyT1(row):
    """Return the cumulative time at the end of T1 (row[2])."""
    return row[2]


def getKeyBike(row):
    """Return the cumulative time at the end of the bike (row[3])."""
    return row[3]


def getKeyT2(row):
    """Return the cumulative time at the end of T2 (row[4])."""
    return row[4]


def getKeyRun(row):
    """Return the cumulative time at the end of the run (row[5])."""
    return row[5]
# Standings after each leg: the aggregate-time rows sorted by the cumulative
# time at the end of that leg, so index + 1 is the overall rank at that point.
# First with wave start offsets applied...
timesBySwim = sorted(timesByOverall, key=getKeySwim)
timesByT1 = sorted(timesByOverall, key=getKeyT1)
timesByBike = sorted(timesByOverall, key=getKeyBike)
timesByT2 = sorted(timesByOverall, key=getKeyT2)
timesByRun = sorted(timesByOverall, key=getKeyRun)
# ...then as if everybody had started together.
timesBySwimStartTogether = sorted(timesByOverallStartTogether, key=getKeySwim)
timesByT1StartTogether = sorted(timesByOverallStartTogether, key=getKeyT1)
timesByBikeStartTogether = sorted(timesByOverallStartTogether, key=getKeyBike)
timesByT2StartTogether = sorted(timesByOverallStartTogether, key=getKeyT2)
timesByRunStartTogether = sorted(timesByOverallStartTogether, key=getKeyRun)
#-- returns how many people you passed IN EACH OF the legs T-1, Bike, T-2, and the run, given wave starts
#-- it does this by taking your rank before a leg, and subtracting your rank after that leg.
#-- positive means you passed that many people, negative means that many people passed you
def numPplPassed(bib):
    """Return [T1, bike, T2, run] net places gained, with wave offsets applied.

    Each entry is (position before the leg) - (position after the leg). The
    subtraction looks reversed because a lower position is better (1 beats 2).

    Raises:
        TypeError: if the bib is missing from the standings, because
            findIndexOfBibInLeg() then returns None.
    """
    standings = (timesBySwim, timesByT1, timesByBike, timesByT2, timesByRun)
    # Look each position up once; every inner one is used by two differences.
    positions = [findIndexOfBibInLeg(bib, leg) for leg in standings]
    return [before - after for before, after in zip(positions, positions[1:])]
#-- returns how many people you passed IN EACH OF the legs T-1, Bike, T-2, and the run if everyone started together
def numPplPassedStartTogether(bib):
    """Return [T1, bike, T2, run] net places gained, ignoring wave offsets.

    Each entry is (position before the leg) - (position after the leg) in the
    start-together standings; positive means you passed that many people.

    Raises:
        TypeError: if the bib is missing from the standings, because
            findIndexOfBibInLeg() then returns None.
    """
    standings = (
        timesBySwimStartTogether,
        timesByT1StartTogether,
        timesByBikeStartTogether,
        timesByT2StartTogether,
        timesByRunStartTogether,
    )
    # Look each position up once; every inner one is used by two differences.
    positions = [findIndexOfBibInLeg(bib, leg) for leg in standings]
    return [before - after for before, after in zip(positions, positions[1:])]
#-- returns aggregate number of people you passed SO FAR after each leg T-1, Bike, T-2, and the run | |
# def aggregatePplPassed(bib): | |
# nPassed = [] | |
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByT1)) | |
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByBike)) | |
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByT2)) | |
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByRun)) | |
# return nPassed | |
#-- returns the net number of people you passed during the swim
#-- algorithm: work out when you left the water (wave start + swim time)
#-- then count everyone in waves before you who left the water later = number of people you passed
#-- and everyone in waves after you (i.e. excluding your group and preceding groups)
#-- who left the water earlier = number of people who passed you
def _swimWaveOrder(athlete):
    """Return 0 for pros, 1-6 for waves 2-7, or None for an unknown division."""
    if athlete[Col.cat] == 'P':
        return 0
    ageGroupWaves = (wave2, wave3, wave4, wave5, wave6, wave7)
    for order, divisions in enumerate(ageGroupWaves, start=1):
        if athlete[Col.div] in divisions:
            return order
    return None


def numPplPassedInSwim(bib):
    """Return (people you passed) - (people who passed you) during the swim.

    Only athletes from other waves are counted, since your own wave started
    with you. Exit times are absolute (wave start offset + swim time) on
    both sides of every comparison; comparing an earlier wave's raw swim time
    against your absolute exit time would treat that wave as starting at 0.

    Args:
        bib: bib number; converted with str() before comparing.

    Returns:
        The net count as an int; 0 when the target athlete's division is not
        in any wave. Athletes whose division is unknown are not counted.

    Raises:
        ValueError: if no athlete in r has this bib.
    """
    target = str(bib)
    targetAthlete = next((a for a in r if a[Col.bib] == target), None)
    if targetAthlete is None:
        raise ValueError("no athlete with bib %s" % bib)

    targetWave = _swimWaveOrder(targetAthlete)
    if targetWave is None:
        return 0
    targetExit = getStartTime(targetAthlete) + targetAthlete[Col.swimTime]

    youPassed = 0
    passedYou = 0
    for athlete in r:
        wave = _swimWaveOrder(athlete)
        if wave is None or wave == targetWave:
            continue
        swimExit = getStartTime(athlete) + athlete[Col.swimTime]
        if wave < targetWave and swimExit > targetExit:
            youPassed += 1  # started ahead of you, left the water after you
        elif wave > targetWave and swimExit < targetExit:
            passedYou += 1  # started behind you, left the water before you
    return youPassed - passedYou
#---------------------------------------------------
# Append the calculated numbers onto each athlete row to be written to the file.
# The order matches the extra names at the end of the CSV header.
for athlete in r:
    athlete.append(numPplPassedInSwim(athlete[Col.bib]))  # special case: swim
    # people passed in T1, bike, T2 and run, with wave starts
    athlete.extend(numPplPassed(athlete[Col.bib]))
    # the same four numbers, ignoring wave starts
    athlete.extend(numPplPassedStartTogether(athlete[Col.bib]))
    #-- and the estimated rank after T1, to show it in the percentiles
    athlete.append(findIndexOfBibInLeg(athlete[Col.bib], timesByT1) + 1)  # rank = index + 1
# Convert timedeltas to whole seconds so d3 can use them more easily.
for line in r:
    # Slice assignment keeps the same list object that r refers to.
    line[:] = [
        int(datum.total_seconds()) if isinstance(datum, timedelta) else datum
        for datum in line
    ]
# When this script is run, results.csv is overwritten with the calculated numbers.
header = [
    'place', 'divTot', 'bib', 'cat', 'firstName', 'lastName', 'age', 'sex',
    'div', 'swimRank', 'swimTime', 'swimPace', 't1Time', 'bikeRank',
    'bikeTime', 'bikeSpeed', 't2Rank', 't2Time', 'runRank', 'runTime',
    'runPace', 'totalTime', 'wave', 'wavePassedSwim', 'wavePassedT1',
    'wavePassedBike', 'wavePassedT2', 'wavePassedRun', 'startPassedT1',
    'startPassedBike', 'startPassedT2', 'startPassedRun', 'estimatedT1Rank',
]
# newline='' lets the csv module control the line endings itself.
with open('results.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    writer.writerows(r)
#--------------------------------------------------- | |
#-- number of people "passed" is aggregate rank after leg_1 - aggregate rank after leg_2 | |
#-- if negative, people passed you | |
# print('In the swim, you passed %s people in the waves ahead of you' % numPplPassedInSwim(122)) | |
# print(numPplPassed(122)) | |
# print(aggregatePplPassed(122)) | |
# passed = [] | |
# for i in range(0, 1020): | |
# try: | |
# passed.append(findIndexOfBibInLeg(i, timesBySwim) - findIndexOfBibInLeg(i, timesByRun)) | |
# except (RuntimeError, TypeError, NameError): | |
# pass | |
# print(sorted(passed)) #-- to see the distribution of overall number of people passed | |
#convert to dictionary (key = bib num, value = list of aggregate times) | |
# algorithm | |
# given bib number, output number and list of people who you passed or passed you in a given leg | |
#----------- if everyone started at the same time | |
# compute aggregated time for each leg | |
# sort based on aggregated time (i.e. get overall rank at each leg finish) | |
# passed those with higher rank in previous leg | |
#----------- given wave start | |
# first wave starts at 00:00:00 | |
# second wave starts at 00:05:00
# third wave starts at 00:10:00, etc
# rank is not given until after the athlete has completed the swim | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment