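# Scraper for coursetalk.org: logs in with a requests session, walks the
# paginated course listing, follows every course page and its reviews, and
# writes five pipe-delimited .dat files (courses, ratings, users, tags and
# professors). Python 2 / BeautifulSoup 3.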
import re
import urllib2
import requests
from BeautifulSoup import BeautifulSoup
from unicodedata import normalize

URL_BASE = 'http://coursetalk.org/'
cursos = None
ratings = None
users = None
tags = None
professors = None
client = None
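# Maps the CSS class of a review's star widget to its numeric rating.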
stars2ratings = {'stars s10': '5.0', 'stars s9': '4.5', 'stars s8': '4.0',
                 'stars s7': '3.5', 'stars s6': '3.0', 'stars s5': '2.5',
                 'stars s4': '2.0', 'stars s3': '1.5', 'stars s2': '1.0',
                 'stars s1': '0.5', 'stars s0': '0.0'}

def session_login():
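    """Log in to coursetalk.org and return an authenticated requests session."""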
    global client
    URL = 'http://coursetalk.org/login'
    client = requests.session()
    # GET first so the server sets the csrftoken cookie; Django expects it
    # echoed back as csrfmiddlewaretoken in the POST body.
    client.get(URL)
    csrftoken = client.cookies['csrftoken']
    login_data = dict(email='caraciol@gmail.com', password='marcelpc',
                      csrfmiddlewaretoken=csrftoken, next='/')
    client.post(URL, data=login_data, headers=dict(Referer=URL))
    return client

def remover_acentos(txt):
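    """Strip accents from txt so it can be written to the ASCII .dat files."""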
    if txt:
        return normalize('NFKD', txt).encode('ASCII', 'ignore')
    else:
        return ''

def create_files():
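    """Open the five pipe-delimited output files for writing.

    cursos.dat:            course_id|name|rating|workload|university|difficulty|provider
    ratings.dat:           user_id|course_id|rating
    users.dat:             user_id|author
    course-tags.dat:       course_id|tag
    course-professors.dat: course_id|professor
    """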
    global cursos, ratings, users, tags, professors
    cursos = open('cursos.dat', 'w')
    ratings = open('ratings.dat', 'w')
    users = open('users.dat', 'w')
    tags = open('course-tags.dat', 'w')
    professors = open('course-professors.dat', 'w')

def crawl_description(url):
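    """Fetch the description iframe at url and return its first paragraph."""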
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    description = html_parsed.find('p')
    return description.contents[0]

def crawl_inside_course(url):
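    """Scrape one course page: professors, rating, difficulty, workload,
    topic tags and description."""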
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    data = {}
    box = html_parsed.find('div', {'class': 'course_box'})
    professors = box.find('h5')
    professores = []
    for professor in professors.contents[0].split(','):
        professores.append(professor.replace('—', '').replace(' ', '').strip())
    data['professors'] = professores
    try:
        rating = box.find('div', {'class': 'score'}).contents[0]
        data['rating'] = rating
    except AttributeError:
        data['rating'] = ''  # course not rated yet
    details = box.find('table', {'class': 'course_details'})
    difficulty = details.findAll('tr')[1].findAll('td')[1].contents[0]
    data['difficulty'] = difficulty
    workload = details.findAll('tr')[2].findAll('td')[1].contents[0]
    data['workload'] = workload
    try:
        # The workload cell is sometimes wrapped in an extra tag;
        # keep only the inner text.
        workload = re.search(r'<(.*?)>(.*?)</(.*?)>', str(workload)).group(2)
        data['workload'] = unicode(workload)
    except AttributeError:
        pass
    topics = []
    for topic in details.findAll('a', {'class': 'tag'}):
        topics.append(topic.contents[0])
    data['topics'] = topics
    data['description'] = crawl_description(URL_BASE + html_parsed.find('iframe')['src'])
    print data
    return data

def crawl_course(url):
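    """Yield one dict per course row on the listing page at url."""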
    print url
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    table = html_parsed.find('table', {'class': 'table course_list'})
    for course in table.findAll('tr')[1:]:  # skip the header row
        c = {}
        cells = course.findAll('td')
        if cells[0].find('a'):
            c['provider'] = cells[0].find('a')['href'].replace('/', '')
        else:
            c['provider'] = None
        c['slug'] = cells[1].find('a')['href']
        c['name'] = cells[1].find('a').contents[0]
        if len(cells[1].findAll('a')) > 1:
            c['university'] = cells[1].findAll('a')[1].contents[0]
        else:
            c['university'] = None
        data = crawl_inside_course(URL_BASE + c['slug'])
        c['difficulty'] = data['difficulty']
        c['rating'] = data['rating']
        c['topics'] = data['topics']
        c['description'] = data['description']
        c['professors'] = data['professors']
        c['workload'] = data['workload']
        yield c

def crawl_reviews(url, client):
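    """Walk a course's paginated reviews; return parallel lists of author
    slugs and star ratings."""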
    total = 0
    pg = 1
    raw_html = client.get(url + '/?page=1').text
    html_parsed = BeautifulSoup(raw_html)
    try:
        pages = html_parsed.find('div', {'class': 'pagination pagination-centered'}).findAll('a')[1].contents[0]
        pages = int(re.search(r'of ([\d]+)', pages).group(1))
    except AttributeError:
        if 'No reviews yet' in raw_html:
            pages = 0
        else:
            print 'only one page of reviews'
            pages = 1
    autores = []
    ratings = []
    while pg <= pages:
        for review in html_parsed.findAll('tr', {'class': 'review-tr'}):
            try:
                author = review.find('a')['href'].replace('/u/', '')
            except KeyError:
                author = 'anonymous'  # review without a profile link
            stars = review.find('div', {'class': re.compile('stars')})
            autores.append(author)
            ratings.append(stars2ratings[stars['class']])
            total += 1
        pg += 1
        if pg <= pages:  # don't fetch one page past the last
            html_parsed = BeautifulSoup(client.get(url + '/?page=%d' % pg).text)
    print 'total reviews crawled:', total
    return autores, ratings

def crawl_courses(client):
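    """Crawl all 112 listing pages and write out the .dat files."""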
    total = 0
    id_curso = 1
    id_user = 1
    users_set = {}
    for pg in range(1, 113):  # the listing had 112 pages at crawl time
        url = URL_BASE + '?page=%d' % pg
        for course in crawl_course(url):
            print course
            cursos.write('%d|%s|%s|%s|%s|%s|%s\n' % (
                id_curso, remover_acentos(course['name']), course['rating'],
                remover_acentos(course['workload']),
                remover_acentos(course['university']), course['difficulty'],
                course['provider']))
            for tag in course['topics']:
                tags.write('%d|%s\n' % (id_curso, tag))
            for professor in course['professors']:
                professors.write('%d|%s\n' % (id_curso, remover_acentos(professor)))
            autores, rt = crawl_reviews(URL_BASE + course['slug'], client)
            for autor, nota in zip(autores, rt):
                if autor not in users_set:
                    # first time this reviewer is seen: assign and record an id
                    users_set[autor] = id_user
                    id_user += 1
                    users.write('%d|%s\n' % (users_set[autor], autor))
                ratings.write('%d|%d|%s\n' % (users_set[autor], id_curso, nota))
                if autor == 'anonymous':
                    # drop the mapping so each anonymous review gets a fresh id
                    del users_set[autor]
            id_curso += 1
            total += 1
    print 'total courses imported:', total
    print 'next course id:', id_curso
    cursos.close()
    users.close()
    ratings.close()
    professors.close()
    tags.close()

if __name__ == '__main__':
    create_files()
    client = session_login()
    crawl_courses(client)
    # crawl_inside_course(URL_BASE + '/coursera/pre-calculus')