Skip to content

Instantly share code, notes, and snippets.

@jsanch
Last active August 10, 2016 07:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jsanch/f786dba5aa070f743f25e32235350635 to your computer and use it in GitHub Desktop.
Save jsanch/f786dba5aa070f743f25e32235350635 to your computer and use it in GitHub Desktop.
# Getting IDB MOOC Data into Socrata
# Source: http://52.202.188.134:8888
import requests as re
import json
import numpy as np
import pandas as pd
def build_url(course_id, data_segment, pageSize=300000,
              base_url="http://52.202.188.134/v1/"):
    """Return the API URL for one data segment of a course.

    Args:
        course_id: Course identifier, e.g. "idb1x_2015_3t".
        data_segment: Segment name, e.g. "users" or "enrollments".
        pageSize: Value sent as the ``pageSize`` query parameter.
        base_url: API root, including the trailing slash. Defaults to the
            host this script was written against.

    Returns:
        A string of the form
        ``<base_url><course_id>.<data_segment>?pageSize=<pageSize>``.
    """
    return "{}{}.{}?pageSize={}".format(
        base_url, course_id, data_segment, pageSize)
# Course to export and the data segments requested for it.
course_id = "idb1x_2015_3t"
data_segments = [
    "users",
    "certificates",
    "courseware",
    "demographics",
    "enrollments",
]

# Columns kept for each segment; every other column is treated as obsolete.
useful_columns = {
    "users": [
        "id", "is_staff", "is_active", "is_superuser",
        "last_login", "date_joined",
    ],
    "certificates": [
        "id", "user_id", "course_id", "grade", "status", "name",
        "created_date", "modified_date", "mode",
    ],
    "courseware": [
        "id", "module_type", "module_id", "student_id", "state", "grade",
        "created", "modified", "max_grade", "course_id",
    ],
    "demographics": [
        "id", "user_id", "language", "location", "gender",
        "year_of_birth", "level_of_education", "goals", "country",
    ],
    "enrollments": [
        "id", "user_id", "course_id", "created", "is_active", "mode",
    ],
}
# Download each data segment, keep only its useful columns, and write one
# CSV per segment named "<segment>.csv". Note that `re` in this file is the
# `requests` library, not the standard-library regex module.
df_segments = {}
for d in data_segments:
    response = re.get(build_url(course_id, d))
    # Fail loudly on an HTTP error instead of trying to parse an error body.
    response.raise_for_status()
    df = pd.DataFrame.from_dict(response.json()['results'])
    # Selecting by column list drops every column not named in useful_columns.
    df_segments[d] = pd.DataFrame(df, columns=useful_columns[d])
    # set_index returns a new frame, so its result has to be used. The
    # original discarded it and then wrote the unfiltered frame to disk.
    # The "id" column is left in df_segments[d] for later steps.
    df_segments[d].set_index("id").to_csv(d + ".csv")
# Geocode countries: load the country -> latitude/longitude table and key it
# by the "country" column so rows can be looked up by country value.
lat_long = pd.read_csv(
    "https://gist.githubusercontent.com/jsanch/5f47ddc207f841f44c21dc9e4eaf70d5/raw/dcb855e1dfeb9fd27a7e2759be83c591b8a85788/country_latlon.csv"
).set_index("country")
def get(x, coor, table=None):
    """Look up one coordinate for country ``x``.

    Args:
        x: Country key as it appears in the index of the lookup table.
        coor: "lat" selects the ``latitude`` column; any other value
            selects the ``longitude`` column.
        table: Optional DataFrame indexed by country with ``latitude`` and
            ``longitude`` columns. Defaults to the module-level ``lat_long``.

    Returns:
        The coordinate stored for ``x``, or "" when the country (or the
        column) is not present in the table.
    """
    if table is None:
        table = lat_long
    column = "latitude" if coor == "lat" else "longitude"
    try:
        # .loc replaces .ix, which was removed in pandas 1.0. With the old
        # bare `except:` the resulting AttributeError was swallowed and every
        # lookup silently returned "".
        return table.loc[x, column]
    except (KeyError, TypeError):
        # Unknown or missing country: keep the best-effort empty value.
        return ""
# Add "_lat" / "_long" columns to the demographics frame by looking up each
# row's country, then write the frame to demographics.csv.
demo = df_segments['demographics']
demo['_lat'] = demo['country'].apply(lambda country: get(country, "lat"))
demo['_long'] = demo['country'].apply(lambda country: get(country, "long"))
demo.to_csv("demographics.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment