Skip to content

Instantly share code, notes, and snippets.

@EricZeiberg
Created November 8, 2015 20:10
Show Gist options
  • Save EricZeiberg/982e5a240533276a69b8 to your computer and use it in GitHub Desktop.
Save EricZeiberg/982e5a240533276a69b8 to your computer and use it in GitHub Desktop.
Python script to calculate the effect of sleep on perceived effort on runs
from bs4 import BeautifulSoup, NavigableString, Tag
import numpy as np
from sklearn import cross_validation
from sklearn.linear_model import Ridge
def _extract_scale(comments):
    """Return the integer effort rating found in a comment cell, or None.

    The comment cell is walked by its <br> tags: a text node that sits
    directly between two <br> tags is treated as one comment line.  The first
    such line containing "SCALE: " supplies the rating, taken as the integer
    that follows the first ": " in that line.

    Raises ValueError (from int) if the text after ": " is not an integer.
    """
    for br in comments.find_all("br"):
        piece = br.next_sibling
        # Only non-empty text nodes can carry a comment line.
        if not (piece and isinstance(piece, NavigableString)):
            continue
        following = piece.next_sibling
        # The line only counts when it is terminated by another <br>.
        if not (isinstance(following, Tag) and following.name == "br"):
            continue
        text = str(piece).strip()
        if "SCALE: " in text:
            return int(text.split(": ")[1])
    return None


def parseHTML(html):
    """Extract paired sleep / effort-scale samples from a log page.

    The first <form> of the page is searched for tables with class
    "encapsule"; the first such table is not a log entry and is skipped.
    Entries whose comment has no "SCALE: <int>" line are ignored.  For the
    remaining entries the sleep cell is read; a value of "--" (no sleep
    recorded) is replaced by the mean of the sleep values collected so far.

    If an entry has no recorded sleep and there is no earlier sleep value to
    average, the entry is dropped entirely (previously this raised
    ZeroDivisionError), so both returned arrays always have the same length.

    Args:
        html: the page markup as a string.

    Returns:
        A two-element list [sleepArray, scaleArray] of numpy arrays, each
        reshaped to (n_samples, 1) as expected by scikit-learn estimators.

    Raises:
        IndexError: if the page does not have the expected form / table /
            cell layout.
        ValueError: if a scale or sleep value cannot be parsed as a number.
    """
    scales = []
    sleeps = []
    soup = BeautifulSoup(html, "html.parser")
    logs = soup.find_all("form")[0]

    # The first "encapsule" table is something else, so skip it.
    for table in logs.find_all("table", class_="encapsule")[1:]:
        entry = table.find_all("td", class_="pad-reset")[0]

        comments = entry.find_all("tr")[3].find_all("td")[1]
        scale = _extract_scale(comments)
        if scale is None:
            # No scale data in the comment: nothing to pair the sleep with.
            continue

        sleep_cell = (
            entry.find_all("div")[1]
            .find_all("div")[0]
            .find_all("tr")[1]
            .find_all("td")[1]
        )
        sleep = sleep_cell.text.strip()

        if sleep == "--":
            if not sleeps:
                # No earlier sleep data to impute from; skip the whole entry
                # rather than dividing by zero or unbalancing the arrays.
                continue
            # Fill the gap with the mean of all sleep data collected so far.
            hours = sum(sleeps) / float(len(sleeps))
        else:
            hours = float(sleep.split(" ")[0])

        scales.append(scale)
        sleeps.append(hours)

    # Sklearn likes data in a [n_samples, 1] format, so reshape both arrays.
    sleepArray = np.reshape(sleeps, (len(sleeps), 1))
    scaleArray = np.reshape(scales, (len(scales), 1))
    return [sleepArray, scaleArray]
# Training data: HTML downloaded from Running2Win for 2014-2015.
with open("R2WTest.html", "r") as myfile:
    html = myfile.read()
data = parseHTML(html)

# Tried to fit this data with a couple of other regression modules; Ridge
# gave the best result.
ridge = Ridge()
ridge.fit(data[0], data[1])

# Test data: HTML from 2015-2016, used to calculate the model score.
with open("R2Win2015.html", "r") as myfile:
    test = myfile.read()
testData = parseHTML(test)

# ridge.score(testData[0], testData[1]) returns the coefficient of
# determination (R^2).  It came out very close to 0, which means the model
# explains almost none of the variance in perceived effort: sleep is a very
# weak predictor here.  The near-identical predictions below agree with that.

# predict() expects a 2-D (n_samples, n_features) input, hence [[hours]].
print(ridge.predict([[10]]))  # 10 hours of sleep -> 7.27747915 in the original run
print(ridge.predict([[1]]))  # 1 hour of sleep -> 7.19472833 in the original run
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment