Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python code that sorts through the Lavaman Triathlon 2015 results, cleans up the data, and creates a CSV file to be parsed by D3 for the data visualizations of the race shown at http://jimmychion.com/lavaman2015.html
from datetime import timedelta
from enum import IntEnum
import csv
# Name the columns of a parsed results row so the code reads
# row[Col.swimTime] instead of row[10].
class Col(IntEnum):
    """Zero-based position of each field in one whitespace-split results row."""

    # placing and athlete identity
    place = 0
    divTot = 1
    bib = 2
    cat = 3
    firstName = 4
    lastName = 5
    age = 6
    sex = 7
    div = 8

    # swim leg
    swimRank = 9
    swimTime = 10
    swimPace = 11

    # first transition (there is no rank column for T1)
    t1Time = 12

    # bike leg
    bikeRank = 13
    bikeTime = 14
    bikeSpeed = 15

    # second transition
    t2Rank = 16
    t2Time = 17

    # run leg
    runRank = 18
    runTime = 19
    runPace = 20

    # finish
    totalTime = 21
class StartTimes(IntEnum):
    """Start waves keyed to the values 0, 5, ... 30.

    NOTE(review): nothing else in this script reads this enum. The values
    match the minute offsets hard-coded in getStartTime(), so they are
    presumably minutes after the first wave -- confirm before relying on it.
    The member names do not all line up with the division codes in the
    wave lists (e.g. M4454 here versus 'M4549'/'M5054' in wave3).
    """

    proElite = 0
    M044 = 5
    M4454 = 10
    M55 = 15
    F039 = 20
    F4049 = 25
    F50 = 30
    # Novice = 35
# Division codes that make up each start wave. There is no list for wave 1:
# the code below recognises it by category 'P' rather than by division.
# These stay plain lists because later code concatenates them with '+'.
wave2 = [
    'M119', 'M2024', 'M2529', 'M3034', 'M3539', 'M4044',
]
wave3 = [
    'M4549', 'M5054',
]
wave4 = [
    'M5559', 'M6064', 'M6569', 'M7074', 'M75PL',
]
wave5 = [
    'F119', 'F2024', 'F2529', 'F3034', 'F3539',
]
wave6 = [
    'F4044', 'F4549',
]
wave7 = [
    'F5054', 'F5559', 'F6064', 'F6569', 'F7074', 'F75PL',
]
# Import the Lavaman results (the individual results copied into a separate
# text file). Each line becomes one list of whitespace-separated tokens;
# str.split() with no argument already ignores leading/trailing whitespace.
with open("results.txt") as results_file:
    r = [raw_line.split() for raw_line in results_file]
# Line the columns up with Col, then remove dirty data.
#
# Step 1: combine long last names. A surname of several words is split into
# several tokens, which shifts every later column to the right; the extra
# tokens are folded back into the lastName column.
#
# Step 2: remove dirty data (i.e. results where times are clumped), which
# shows up as a row with fewer columns than Col defines. The check has to run
# AFTER the merge: a row with a two-word surname and one missing field has 22
# raw tokens, but only 21 real columns.
# Not ideal, sorry to those 51 athletes whose ankle bracelets didn't pick up.
# TODO "clean" data by getting a best estimate of their rank by giving them an
# average time of the 10 people ranked around them.
def _merge_last_name(line):
    """Fold surname tokens that spilled past Col.lastName back into it.

    The age column is taken to be the first all-digit token at or after
    Col.age; every token between Col.lastName and that position is joined
    with single spaces into one lastName value. The row is modified in place
    and also returned.

    Rows that are already aligned, rows too short to have an age column, and
    rows in which no digit token is found are left untouched.
    """
    end = Col.age
    while end < len(line) and not line[end].isdigit():
        end += 1
    # Replace by position (slice assignment) rather than list.remove(), which
    # deletes the first *equal* value and could hit an earlier column such as
    # a first name identical to one of the surname words.
    if Col.age < end < len(line):
        line[Col.lastName:end] = [' '.join(line[Col.lastName:end])]
    return line


r = [line for line in map(_merge_last_name, r) if len(line) >= len(Col)]
# Convert times to python timedelta objects so they can be added and compared.
def _parse_clock(token):
    """Turn "mm:ss" or "h:mm:ss" into a timedelta; return anything else as is."""
    fields = token.split(':')
    if len(fields) == 2:  # "12:34" has no hours part
        minutes, seconds = fields
        return timedelta(minutes=int(minutes), seconds=int(seconds))
    if len(fields) == 3:
        hours, minutes, seconds = fields
        return timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
    return token


for row in r:
    # Only tokens containing ':' are treated as times; the row is rewritten
    # in place through a slice so other references to it stay valid.
    row[:] = [_parse_clock(cell) if ':' in cell else cell for cell in row]
#-- adding together the split times and comparing it to the finish time, it looks like the finish time is determined independent of the split times (that's good)
#-- and the split times are rounded up from the milliseconds. Code to show that:
# for line in r:
# print (line[swimTime] + line[t1Time] + line[bikeTime] + line[t2Time] + line[runTime]) - line[totalTime]
# Append the start-wave number (as a string) to every athlete row; this is
# the 'wave' column of the CSV header. Category 'P' is wave 1; everyone else
# is placed by division.
_WAVE_NUMBER_BY_DIVISIONS = (
    ('2', wave2),
    ('3', wave3),
    ('4', wave4),
    ('5', wave5),
    ('6', wave6),
    ('7', wave7),
)

for athlete in r:
    if athlete[Col.cat] == 'P':
        wave_number = '1'
    else:
        wave_number = next(
            (number for number, divisions in _WAVE_NUMBER_BY_DIVISIONS
             if athlete[Col.div] in divisions),
            None,
        )

    if wave_number is None:
        # NOTE(review): nothing is appended in this case, so the row ends up
        # one column shorter than the rows that were recognised.
        print("not a recognized division")
    else:
        athlete.append(wave_number)
# wave start offset
def getStartTime(athlete):
    """Return how long after the first wave this athlete's wave set off.

    Category 'P' athletes get a zero offset. Everyone else is looked up by
    division: wave2 is 5 minutes, and each later wave 5 minutes more, up to
    30 minutes for wave7.

    Returns a timedelta, or None (after printing a message) when the
    division is in none of the wave lists.
    """
    if athlete[Col.cat] == 'P':
        return timedelta(0)

    offsets = (
        (5, wave2),
        (10, wave3),
        (15, wave4),
        (20, wave5),
        (25, wave6),
        (30, wave7),
    )
    for minutes, divisions in offsets:
        if athlete[Col.div] in divisions:
            return timedelta(minutes=minutes)

    print("not a recognized division")
    return None
#-- find index of bib numbers in lists (i.e. rank-1) and find difference
def findIndexOfBibInLeg(targetBib, leg):
    """Return the index of the first row of `leg` that contains the bib.

    The bib is compared as a string against every element of each row, not
    only the first one. Returns None when no row contains it.
    """
    wanted = str(targetBib)
    return next(
        (position for position, entry in enumerate(leg) if wanted in entry),
        None,
    )
# Calculate the aggregated times: for every athlete, the running total after
# the swim, T1, the bike, T2 and the run.
_LEG_TIME_COLUMNS = (
    Col.swimTime,
    Col.t1Time,
    Col.bikeTime,
    Col.t2Time,
    Col.runTime,
)


def _cumulative_times(athlete, offset):
    """Return [bib, total after swim, after T1, after bike, after T2, after run].

    `offset` is the time already elapsed when the athlete starts; every
    running total includes it.
    """
    row = [athlete[Col.bib]]
    elapsed = offset
    for column in _LEG_TIME_COLUMNS:
        elapsed = elapsed + athlete[column]
        row.append(elapsed)
    return row


# Overall absolute times, accounting for wave start times.
timesByOverall = [
    _cumulative_times(athlete, getStartTime(athlete)) for athlete in r
]

# The same totals without accounting for waves, i.e. the individual aggregate
# times after each leg if everybody had started at the same time.
timesByOverallStartTogether = [
    _cumulative_times(athlete, timedelta(0)) for athlete in r
]
# Sort keys for the aggregated-time rows, whose layout is
# [bib, after swim, after T1, after bike, after T2, after run].
def getKeySwim(row):
    """Sort key: cumulative time at the end of the swim (element 1)."""
    return row[1]


def getKeyT1(row):
    """Sort key: cumulative time at the end of T1 (element 2)."""
    return row[2]


def getKeyBike(row):
    """Sort key: cumulative time at the end of the bike (element 3)."""
    return row[3]


def getKeyT2(row):
    """Sort key: cumulative time at the end of T2 (element 4)."""
    return row[4]


def getKeyRun(row):
    """Sort key: cumulative time at the end of the run (element 5)."""
    return row[5]
# Standings at each checkpoint: one sorted copy of the aggregated-time table
# per leg, so an athlete's index in a list is their rank - 1 at that point.
_LEG_SORT_KEYS = (getKeySwim, getKeyT1, getKeyBike, getKeyT2, getKeyRun)

# With wave start offsets included.
(timesBySwim,
 timesByT1,
 timesByBike,
 timesByT2,
 timesByRun) = [
    sorted(timesByOverall, key=leg_key) for leg_key in _LEG_SORT_KEYS
]

# As if everybody had started together.
(timesBySwimStartTogether,
 timesByT1StartTogether,
 timesByBikeStartTogether,
 timesByT2StartTogether,
 timesByRunStartTogether) = [
    sorted(timesByOverallStartTogether, key=leg_key) for leg_key in _LEG_SORT_KEYS
]
#-- returns how many people you passed IN EACH OF the legs T-1, Bike, T-2, and the run, given wave starts
#-- it does this by taking your rank before a leg and subtracting your rank after that leg.
#-- positive means you passed that many people, negative means that many people passed you
def numPplPassed(bib):
    """Return [T1, bike, T2, run] net passes for `bib`, wave starts included.

    Each entry is the standing before the leg minus the standing after it.
    That looks backwards, but a lower rank is better (1 beats 2), so moving
    up the field gives a positive number.
    """
    checkpoints = (timesBySwim, timesByT1, timesByBike, timesByT2, timesByRun)
    standings = [findIndexOfBibInLeg(bib, table) for table in checkpoints]
    return [before - after for before, after in zip(standings, standings[1:])]
#-- returns how many people you passed IN EACH OF the legs T-1, Bike, T-2, and the run if everyone started together
def numPplPassedStartTogether(bib):
    """Return [T1, bike, T2, run] net passes for `bib`, ignoring wave starts.

    Each entry is the standing before the leg minus the standing after it,
    so a positive number means places gained during that leg.
    """
    checkpoints = (
        timesBySwimStartTogether,
        timesByT1StartTogether,
        timesByBikeStartTogether,
        timesByT2StartTogether,
        timesByRunStartTogether,
    )
    standings = [findIndexOfBibInLeg(bib, table) for table in checkpoints]
    return [before - after for before, after in zip(standings, standings[1:])]
#-- returns aggregate number of people you passed SO FAR after each leg T-1, Bike, T-2, and the run
# def aggregatePplPassed(bib):
# nPassed = []
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByT1))
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByBike))
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByT2))
# nPassed.append(findIndexOfBibInLeg(bib, timesBySwim) - findIndexOfBibInLeg(bib, timesByRun))
# return nPassed
#-- returns the net number of people from other waves you passed in the swim
#-- algorithm: work out when you left the water on the race clock
#-- (your wave start offset + your swim time), then
#--   count everyone in waves before you who left the water after you
#--     = number of people you passed
#--   count everyone in waves after you who left the water before you
#--     = number of people who passed you
#-- athletes in your own wave are never counted either way
def numPplPassedInSwim(bib):
    """Return people passed minus people who passed you, across start waves.

    All comparisons are made on the race clock, i.e. getStartTime() plus the
    swim time for both athletes. Earlier versions compared an earlier-wave
    athlete's raw swim time against the target's clock time, which left that
    athlete's own start offset out and under-counted the people passed.

    Args:
        bib: bib number of the athlete of interest (int or str).

    Returns:
        An int: positive when the athlete overtook more people than
        overtook them. 0 when the athlete's own division is not in any wave.

    Raises:
        ValueError: if no row in r carries this bib.
    """
    wanted = str(bib)
    targetAthlete = next(
        (athlete for athlete in r if athlete[Col.bib] == wanted), None
    )
    if targetAthlete is None:
        raise ValueError("no athlete with bib %s" % wanted)

    targetStart = getStartTime(targetAthlete)
    if targetStart is None:
        # Division not in any wave: there is no start time to compare with.
        return 0
    targetExit = targetStart + targetAthlete[Col.swimTime]

    youPassed = 0
    passedYou = 0
    for athlete in r:
        start = getStartTime(athlete)
        if start == targetStart:
            continue  # same wave, which also covers the target athlete
        swimExit = start + athlete[Col.swimTime]
        if start < targetStart and swimExit > targetExit:
            youPassed += 1  # started ahead of you, left the water behind you
        elif start > targetStart and swimExit < targetExit:
            passedYou += 1  # started behind you, left the water ahead of you
    return youPassed - passedYou
#---------------------------------------------------
# Append the calculated numbers onto each athlete row, in the order of the
# CSV header columns that follow 'wave'.
for athlete in r:
    bib = athlete[Col.bib]

    swim_passes = numPplPassedInSwim(bib)               # special case: swim
    wave_passes = numPplPassed(bib)                     # per leg, with wave starts
    together_passes = numPplPassedStartTogether(bib)    # per leg, ignoring wave starts
    # estimated rank for T1, used to show it in the percentiles (rank = index + 1)
    t1_rank = findIndexOfBibInLeg(bib, timesByT1) + 1

    athlete.extend([swim_passes, *wave_passes, *together_passes, t1_rank])
# Convert the timedelta values to whole seconds, which is easier for d3 to use.
# Every other value in the row is left as it is.
for line in r:
    for i, datum in enumerate(line):
        # isinstance is the idiomatic type test (and also accepts subclasses)
        if isinstance(datum, timedelta):
            line[i] = int(datum.total_seconds())
# When this script is run, results.csv is overwritten with the calculated
# numbers: the original result columns followed by the appended ones.
header = [
    'place', 'divTot', 'bib', 'cat', 'firstName', 'lastName', 'age', 'sex', 'div',
    'swimRank', 'swimTime', 'swimPace',
    't1Time',
    'bikeRank', 'bikeTime', 'bikeSpeed',
    't2Rank', 't2Time',
    'runRank', 'runTime', 'runPace',
    'totalTime',
    'wave',
    'wavePassedSwim', 'wavePassedT1', 'wavePassedBike', 'wavePassedT2', 'wavePassedRun',
    'startPassedT1', 'startPassedBike', 'startPassedT2', 'startPassedRun',
    'estimatedT1Rank',
]
# newline='' lets the csv module control the line endings itself.
with open('results.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    writer.writerows(r)
#---------------------------------------------------
#-- number of people "passed" is aggregate rank after leg_1 - aggregate rank after leg_2
#-- if negative, people passed you
# print('In the swim, you passed %s people in the waves ahead of you' % numPplPassedInSwim(122))
# print(numPplPassed(122))
# print(aggregatePplPassed(122))
# passed = []
# for i in range(0, 1020):
# try:
# passed.append(findIndexOfBibInLeg(i, timesBySwim) - findIndexOfBibInLeg(i, timesByRun))
# except (RuntimeError, TypeError, NameError):
# pass
# print(sorted(passed)) #-- to see the distribution of overall number of people passed
#convert to dictionary (key = bib num, value = list of aggregate times)
# algorithm
# given bib number, output number and list of people who you passed or passed you in a given leg
#----------- if everyone started at the same time
# compute aggregated time for each leg
# sort based on aggregated time (i.e. get overall rank at each leg finish)
# passed those with higher rank in previous leg
#----------- given wave start
# first wave starts at 00:00:00
# second wave starts at 00:05:00
# third wave starts at 00:10:00, etc
# rank is not given until after the athlete has completed the swim
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment