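# Scraper for coursetalk.org: logs in with a requests session, walks the
# paginated course listing, follows every course page and its reviews, and
# writes five pipe-delimited .dat files (courses, ratings, users, tags and
# professors). Python 2 / BeautifulSoup 3.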
import re
import urllib2
import requests
from BeautifulSoup import BeautifulSoup
from unicodedata import normalize

URL_BASE = 'http://coursetalk.org/'
cursos = None
ratings = None
users = None
tags = None
professors = None
client = None
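# Maps the CSS class of a review's star widget to its numeric rating.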
stars2ratings = {'stars s10': '5.0', 'stars s9': '4.5', 'stars s8': '4.0',
                 'stars s7': '3.5', 'stars s6': '3.0', 'stars s5': '2.5',
                 'stars s4': '2.0', 'stars s3': '1.5', 'stars s2': '1.0',
                 'stars s1': '0.5', 'stars s0': '0.0'}

def session_login():
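    """Log in to coursetalk.org and return an authenticated requests session."""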
    global client
    URL = 'http://coursetalk.org/login'
    client = requests.session()
    # GET first so the server sets the csrftoken cookie; Django expects it
    # echoed back as csrfmiddlewaretoken in the POST body.
    client.get(URL)
    csrftoken = client.cookies['csrftoken']
    login_data = dict(email='caraciol@gmail.com', password='marcelpc',
                      csrfmiddlewaretoken=csrftoken, next='/')
    client.post(URL, data=login_data, headers=dict(Referer=URL))
    return client

def remover_acentos(txt):
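    """Strip accents from txt so it can be written to the ASCII .dat files."""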
    if txt:
        return normalize('NFKD', txt).encode('ASCII', 'ignore')
    else:
        return ''

def create_files():
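    """Open the five pipe-delimited output files for writing.

    cursos.dat:            course_id|name|rating|workload|university|difficulty|provider
    ratings.dat:           user_id|course_id|rating
    users.dat:             user_id|author
    course-tags.dat:       course_id|tag
    course-professors.dat: course_id|professor
    """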
    global cursos, ratings, users, tags, professors
    cursos = open('cursos.dat', 'w')
    ratings = open('ratings.dat', 'w')
    users = open('users.dat', 'w')
    tags = open('course-tags.dat', 'w')
    professors = open('course-professors.dat', 'w')

def crawl_description(url):
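    """Fetch the description iframe at url and return its first paragraph."""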
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    description = html_parsed.find('p')
    return description.contents[0]

def crawl_inside_course(url):
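    """Scrape one course page: professors, rating, difficulty, workload,
    topic tags and description."""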
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    data = {}
    box = html_parsed.find('div', {'class': 'course_box'})
    professors = box.find('h5')
    professores = []
    for professor in professors.contents[0].split(','):
        professores.append(professor.replace('—', '').replace(' ', '').strip())
    data['professors'] = professores
    try:
        rating = box.find('div', {'class': 'score'}).contents[0]
        data['rating'] = rating
    except AttributeError:
        data['rating'] = ''  # course not rated yet
    details = box.find('table', {'class': 'course_details'})
    difficulty = details.findAll('tr')[1].findAll('td')[1].contents[0]
    data['difficulty'] = difficulty
    workload = details.findAll('tr')[2].findAll('td')[1].contents[0]
    data['workload'] = workload
    try:
        # The workload cell is sometimes wrapped in an extra tag;
        # keep only the inner text.
        workload = re.search(r'<(.*?)>(.*?)</(.*?)>', str(workload)).group(2)
        data['workload'] = unicode(workload)
    except AttributeError:
        pass
    topics = []
    for topic in details.findAll('a', {'class': 'tag'}):
        topics.append(topic.contents[0])
    data['topics'] = topics
    data['description'] = crawl_description(URL_BASE + html_parsed.find('iframe')['src'])
    print data
    return data

def crawl_course(url):
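    """Yield one dict per course row on the listing page at url."""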
    print url
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    table = html_parsed.find('table', {'class': 'table course_list'})
    for course in table.findAll('tr')[1:]:  # skip the header row
        c = {}
        cells = course.findAll('td')
        if cells[0].find('a'):
            c['provider'] = cells[0].find('a')['href'].replace('/', '')
        else:
            c['provider'] = None
        c['slug'] = cells[1].find('a')['href']
        c['name'] = cells[1].find('a').contents[0]
        if len(cells[1].findAll('a')) > 1:
            c['university'] = cells[1].findAll('a')[1].contents[0]
        else:
            c['university'] = None
        data = crawl_inside_course(URL_BASE + c['slug'])
        c['difficulty'] = data['difficulty']
        c['rating'] = data['rating']
        c['topics'] = data['topics']
        c['description'] = data['description']
        c['professors'] = data['professors']
        c['workload'] = data['workload']
        yield c

def crawl_reviews(url, client):
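    """Walk a course's paginated reviews; return parallel lists of author
    slugs and star ratings."""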
    total = 0
    pg = 1
    raw_html = client.get(url + '/?page=1').text
    html_parsed = BeautifulSoup(raw_html)
    try:
        pages = html_parsed.find('div', {'class': 'pagination pagination-centered'}).findAll('a')[1].contents[0]
        pages = int(re.search(r'of ([\d]+)', pages).group(1))
    except AttributeError:
        if 'No reviews yet' in raw_html:
            pages = 0
        else:
            print 'only one page of reviews'
            pages = 1
    autores = []
    ratings = []
    while pg <= pages:
        for review in html_parsed.findAll('tr', {'class': 'review-tr'}):
            try:
                author = review.find('a')['href'].replace('/u/', '')
            except KeyError:
                author = 'anonymous'  # review without a profile link
            stars = review.find('div', {'class': re.compile('stars')})
            autores.append(author)
            ratings.append(stars2ratings[stars['class']])
            total += 1
        pg += 1
        if pg <= pages:  # don't fetch one page past the last
            html_parsed = BeautifulSoup(client.get(url + '/?page=%d' % pg).text)
    print 'total reviews crawled:', total
    return autores, ratings

def crawl_courses(client):
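    """Crawl all 112 listing pages and write out the .dat files."""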
    total = 0
    id_curso = 1
    id_user = 1
    users_set = {}
    for pg in range(1, 113):  # the listing had 112 pages at crawl time
        url = URL_BASE + '?page=%d' % pg
        for course in crawl_course(url):
            print course
            cursos.write('%d|%s|%s|%s|%s|%s|%s\n' % (
                id_curso, remover_acentos(course['name']), course['rating'],
                remover_acentos(course['workload']),
                remover_acentos(course['university']), course['difficulty'],
                course['provider']))
            for tag in course['topics']:
                tags.write('%d|%s\n' % (id_curso, tag))
            for professor in course['professors']:
                professors.write('%d|%s\n' % (id_curso, remover_acentos(professor)))
            autores, rt = crawl_reviews(URL_BASE + course['slug'], client)
            for autor, nota in zip(autores, rt):
                if autor not in users_set:
                    # first time this reviewer is seen: assign and record an id
                    users_set[autor] = id_user
                    id_user += 1
                    users.write('%d|%s\n' % (users_set[autor], autor))
                ratings.write('%d|%d|%s\n' % (users_set[autor], id_curso, nota))
                if autor == 'anonymous':
                    # drop the mapping so each anonymous review gets a fresh id
                    del users_set[autor]
            id_curso += 1
            total += 1
    print 'total courses imported:', total
    print 'next course id:', id_curso
    cursos.close()
    users.close()
    ratings.close()
    professors.close()
    tags.close()

if __name__ == '__main__':
    create_files()
    client = session_login()
    crawl_courses(client)
    # crawl_inside_course(URL_BASE + '/coursera/pre-calculus')