Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Foursquare data extraction
from collections import defaultdict
import csv
import datetime
import time

import requests
# Foursquare v2 API endpoints queried by this script.
CATEGORIES_API = 'https://api.foursquare.com/v2/venues/categories'
CHECKINS_API = 'https://api.foursquare.com/v2/users/self/checkins'


def _local_midnight_epoch(year, month, day):
    """Return the integer Unix timestamp of local midnight on the given date."""
    return int(time.mktime(datetime.date(year, month, day).timetuple()))


# Check-ins are requested for the range [YEAR_BEGIN, YEAR_END).
YEAR_BEGIN = _local_midnight_epoch(2013, 1, 1)
YEAR_END = _local_midnight_epoch(2014, 1, 1)
# Start of the first weekly bucket; the processing loop moves it backwards
# in 7-day steps as older check-ins are encountered.
WEEK = _local_midnight_epoch(2013, 12, 29)

# Query parameters sent with every API request. Replace 'TOKEN_HERE' with a
# real OAuth token before running.
PARAMS = dict(oauth_token='TOKEN_HERE', v='20140104')
# Maps each UTF-8 encoded category name to the top-level category it was
# registered under.
SUBCATEGORIES = dict()
# Names of the top-level categories.
MAIN_CATEGORIES = list()


def add_subcategory(main, cat):
    """Register ``cat`` and every category nested beneath it under ``main``.

    ``cat`` is a dict with a ``'name'`` entry and, optionally, a
    ``'categories'`` list of child dicts of the same shape. Each name found
    in the tree is UTF-8 encoded and stored as a key of ``SUBCATEGORIES``
    with ``main`` as its value.
    """
    # Walk the tree with an explicit stack; every node gets the same value,
    # so visiting order does not affect the resulting mapping.
    pending = [cat]
    while pending:
        node = pending.pop()
        SUBCATEGORIES[node['name'].encode('utf-8')] = main
        children = node.get('categories', False)
        if children:
            pending.extend(children)
# Download the category tree and index every category name by the top-level
# category it belongs to.
category_tree = requests.get(CATEGORIES_API, params=PARAMS).json()['response']['categories']
for top_level in category_tree:
    top_name = top_level['name'].encode('utf-8')
    MAIN_CATEGORIES.append(top_name)
    SUBCATEGORIES[top_name] = top_name
    add_subcategory(top_name, top_level)
print('Processed categories')
# Narrow the query to the configured time range, ask for 250 items per
# request, and fetch the first page of check-ins.
PARAMS['afterTimestamp'] = YEAR_BEGIN
PARAMS['beforeTimestamp'] = YEAR_END
PARAMS['limit'] = '250'
response = requests.get(CHECKINS_API, params=PARAMS).json()
total_checkins = response['response']['checkins']['count']
print('{} checkins found'.format(total_checkins))
# Page through the check-ins, tally them per category, per venue and per
# week/main category, and write the results to four CSV files.
count = 0
categories = defaultdict(int)  # category name -> number of check-ins
venues = defaultdict(int)      # venue name -> number of check-ins
# week-start timestamp -> main category name -> number of check-ins
category_over_time = defaultdict(lambda: defaultdict(int))
SECONDS_PER_WEEK = 60 * 60 * 24 * 7

with open('fsq_dump.csv', 'w') as output, \
        open('fsq_category.csv', 'w') as category_output, \
        open('fsq_venue.csv', 'w') as venue_output, \
        open('fsq_category_over_time.csv', 'w') as category_over_time_output:
    # csv.writer quotes fields containing commas or quotes (e.g. venue
    # names), which plain string formatting would write as broken rows.
    # lineterminator='\n' keeps the line endings the files had before.
    dump_writer = csv.writer(output, lineterminator='\n')
    category_writer = csv.writer(category_output, lineterminator='\n')
    venue_writer = csv.writer(venue_output, lineterminator='\n')
    weekly_writer = csv.writer(category_over_time_output, lineterminator='\n')

    # Keep requesting pages until the API returns an empty one.
    # NOTE(review): the weekly bucketing only ever moves WEEK backwards, so
    # this assumes items arrive newest first -- confirm against the API docs.
    while response['response']['checkins']['items']:
        for item in response['response']['checkins']['items']:
            # Advance the pagination cursor for every item, not only for
            # check-ins; otherwise a page made up solely of other item types
            # would be requested again forever (or `last` would be undefined
            # on the first page).
            last = item['createdAt']
            if item['type'] != 'checkin':
                continue
            # Only the first category listed for the venue is counted.
            category = item['venue']['categories'][0]['name'].encode('utf-8')
            venue = item['venue']['name'].encode('utf-8')
            categories[category] += 1
            venues[venue] += 1
            # Step back as many weeks as needed so that a gap of more than
            # one week without check-ins does not shift later check-ins into
            # the wrong bucket.
            while int(last) < WEEK:
                WEEK -= SECONDS_PER_WEEK
            category_over_time[WEEK][SUBCATEGORIES[category]] += 1
            dump_writer.writerow([last, venue, category])
            count += 1
            if count % 50 == 0:
                print('{} processed'.format(count))
        # Ask for the page of items older than the last one seen.
        PARAMS['beforeTimestamp'] = last
        response = requests.get(CHECKINS_API, params=PARAMS).json()

    # Per-category and per-venue totals, most frequent first.
    for row in sorted(categories.items(), key=lambda x: x[1], reverse=True):
        category_writer.writerow(row)
    for row in sorted(venues.items(), key=lambda x: x[1], reverse=True):
        venue_writer.writerow(row)

    # One row per week that had check-ins, one column per main category.
    weekly_writer.writerow(['Week'] + MAIN_CATEGORIES)
    for week_start in sorted(category_over_time):
        weekly_counts = category_over_time[week_start]
        weekly_writer.writerow(
            [week_start] + [weekly_counts[c] for c in MAIN_CATEGORIES])

print('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment