# Created June 24, 2016 13:18
# Fetch Fitbit community group pages with pycurl, scrape them with BeautifulSoup, and send the results to Redis (written to collect the data for building a Fitbit dashboard on a framework).
from bs4 import BeautifulSoup
import pycurl
import re
import os
from urllib import urlencode
from io import BytesIO
from StringIO import StringIO
import sys
import redis
import time
class getFitbitData:
    """Log in to Fitbit with pycurl and scrape community-group leaderboards.

    ``getHTML`` authenticates and downloads one page of a group's
    ``/leaders`` listing; ``parseHTML`` walks that page with BeautifulSoup
    and reports how many entries it contained.
    """

    cookieDir = './fbcookie.txt'  # where we're storing our cookies

    # Login endpoints. They are blank placeholders and must be filled in
    # before use: the page carrying the login form, and the URL the form is
    # posted to.
    loginPageURL = ''
    loginPostURL = ''

    # User-Agent header sent with every request.
    userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0'

    # Hidden login-form inputs whose values are echoed back in the login POST.
    # Both patterns are non-greedy so the capture stops at the closing quote
    # of the value attribute.
    _hiddenFieldPatterns = {
        '_sourcePage': re.compile("input type=\"hidden\" name=\"_sourcePage\" value=\"(.*?)\""),
        '__fp': re.compile("input type=\"hidden\" name=\"__fp\" value=\"(.*?)\""),
    }

    # Config Redis server we're connecting to
    pool = redis.ConnectionPool(host='', port=6379, password='ifneeded', db=12)
    redisServer = redis.Redis(connection_pool=pool)
    pipe = redisServer.pipeline()

    # 2D array: [group name used in the Redis key, the group's Fitbit URL]
    groups = [["XXXX", ""], ["XXXX", ""]]

    # Date (YYYY-MM-DD) at the time the class is defined.
    date = time.strftime("%Y-%m-%d")

    def __init__(self, user='youruser', password='yourpass'):
        """Store the Fitbit account credentials used for the login POST.

        Args:
            user: account e-mail sent as the ``email`` form field.
            password: account password sent as the ``password`` form field.

        The defaults are placeholders; pass real credentials (ideally read
        from the environment rather than hard-coded).
        """
        self.password = password
        self.user = user

    def _fetch(self, c, url):
        """Run the transfer configured on curl handle ``c`` against ``url``.

        Returns the response body as ``str`` (a byte string on Python 2).
        """
        body = BytesIO()
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, body.write)
        c.perform()
        return str(body.getvalue())

    @staticmethod
    def _parseCount(text):
        """Convert leaderboard text into an int.

        All whitespace is removed, the trailing five characters are dropped
        (presumably a unit label such as "steps" -- verify against the page
        markup), and thousands separators are stripped.

        Raises:
            ValueError: if what remains is not an integer.
        """
        compact = "".join(text.split())
        return int(compact[:-5].replace(',', ''))

    def getHTML(self, groupURL, page):
        """Authenticate, then download one leaderboard page of a group.

        Args:
            groupURL: base URL of the group; ``/leaders?...`` is appended.
            page: page index to request (string or int).

        Returns:
            The body of ``<groupURL>/leaders?timeWindow=CURRENT_MONTH&page=<page>``.

        Raises:
            pycurl.error: if any of the three transfers fails.
        """
        print("authenticate")
        c = pycurl.Curl()
        try:
            # NOTE(review): certificate verification is disabled, so the
            # credentials below are sent without checking the server's
            # identity. Enable it if a CA bundle is available.
            c.setopt(c.SSL_VERIFYPEER, False)
            c.setopt(c.FOLLOWLOCATION, True)
            c.setopt(c.TIMEOUT, 60)
            c.setopt(c.USERAGENT, self.userAgent)

            # 1) Load the login form to pick up its hidden inputs.
            loginPage = self._fetch(c, self.loginPageURL)

            datastuff = {
                'login': 'Log In',
                'disableThirdPartyLogin': 'false',
                'email': self.user,
                'password': self.password,
                'rememberMe': 'true',
            }
            # Get hidden values for post; a field that is not on the page is
            # simply left out instead of raising on a failed match.
            for fieldName, pattern in self._hiddenFieldPatterns.items():
                match = pattern.search(loginPage)
                if match:
                    datastuff[fieldName] = match.group(1)

            # 2) Post the credentials. Cookies are read from and written to
            #    cookieDir from this request onward.
            c.setopt(c.COOKIEJAR, self.cookieDir)
            c.setopt(c.COOKIEFILE, self.cookieDir)
            c.setopt(c.POST, True)
            c.setopt(c.POSTFIELDS, urlencode(datastuff))
            self._fetch(c, self.loginPostURL)

            # 3) Switch the handle back to GET, otherwise the login form
            #    would be re-posted to the leaderboard URL.
            c.setopt(c.HTTPGET, True)
            leadersURL = '%s/leaders?timeWindow=CURRENT_MONTH&page=%s' % (groupURL, page)
            return self._fetch(c, leadersURL)
        finally:
            # Always release the handle, including on transfer errors.
            c.close()

    def parseHTML(self, html, group):
        """Parse one leaderboard page and print each entry's name and steps.

        Args:
            html: page markup as returned by ``getHTML``.
            group: group name (not used by the code currently in this method).

        Returns:
            The number of ``div.info`` entries found inside
            ``div.leaderboardCell.left`` cells; the caller compares it with
            25 to decide whether to request another page.

        Raises:
            ValueError: if a step or average figure cannot be parsed.
        """
        soup = BeautifulSoup(html, "html.parser")
        count = 0
        # Each left-hand leaderboard cell holds "info" divs, one per member.
        for leftCell in soup.find_all("div", {"class": "leaderboardCell left"}):
            for entry in leftCell.find_all("div", {"class": "info"}):
                # Reset per entry so a missing element shows up as None
                # instead of reusing the previous member's value.
                name = steps = avg = None
                for anchor in entry.find_all("a"):
                    name = anchor.get_text()
                for item in entry.find_all("li", {"class": "stat ellipsis"}):
                    steps = self._parseCount(item.get_text())
                for item in entry.find_all("li", {"class": "average ellipsis"}):
                    avg = self._parseCount(item.get_text())
                # NOTE(review): `avg` and `group` are not used and nothing is
                # written to Redis here, although the driver script reads the
                # "<group>:steps" sorted set -- the write step appears to be
                # missing; confirm against the original script.
                print(name)
                print(steps)
                count += 1
        return count
fit = getFitbitData()

# Run through the groups; a page holding 25 listings means there may be more,
# so keep requesting the next page until a shorter page comes back.
for group in fit.groups:
    groupName = group[0]
    groupURL = group[1]

    page = 0
    listCount = fit.parseHTML(fit.getHTML(groupURL, str(page)), groupName)
    while listCount == 25:
        print("run again")
        page += 1
        listCount = fit.parseHTML(fit.getHTML(groupURL, str(page)), groupName)

    # Calculate average - pull every (member, score) pair of the group's
    # "<name>:steps" sorted set from Redis and average the scores that are
    # greater than 0.
    entries = fit.redisServer.zrange(groupName + ':steps', 0, -1, withscores=True)
    activeScores = [entry[1] for entry in entries if entry[1] > 0]
    # Guard the division: with no positive scores the average is reported as
    # 0 instead of raising ZeroDivisionError.
    # NOTE(review): `avg` is not used after this point in the visible code.
    avg = sum(activeScores) / len(activeScores) if activeScores else 0

print("main done")
