# sousatg/apostilando_scrapper.py

## apostilando_scrapper.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
import uuid
import os
import csv
from lxml import etree
import time

def _section_entry(cod, categorias):
	"""Build one section record: the section page URL plus its category tags."""
	return {
		'url' : 'http://www.apostilando.com/sessao.php?cod=' + str(cod),
		'categorias' : categorias
	}


# Sections of the site to scrape. Each record holds the section page URL and a
# comma-separated string of category tags for that section.
siteSetctions = [
	_section_entry(1, 'webdesign, asp'),
	_section_entry(5, 'webdesign, html'),
	_section_entry(8, 'webdesign, php'),
	_section_entry(26, 'webdesign, mysql'),
	_section_entry(9, 'webdesign, xml'),
	_section_entry(3, 'webdesign, css'),
	_section_entry(2, 'webdesign, cgi, perl'),
	_section_entry(6, 'webdesign, java'),
	_section_entry(7, 'webdesign, javascript'),
	_section_entry(25, 'webdesign, diversas'),
]

def login(s, username, senha):
	"""Log in to apostilando.com through the given session.

	Args:
		s: Session-like object exposing ``get`` and ``post`` (the script
			passes a ``requests.Session``).
		username: Account user name, sent as the ``txtUser`` form field.
		senha: Account password, sent as the ``txtSenha`` form field.

	Returns:
		The same session object ``s`` that was passed in.
	"""
	# Request the login page before posting the credentials; presumably this
	# lets the site set its session cookie first -- TODO confirm.
	s.get('http://www.apostilando.com/login.php')

	payload = {
		'cod_apostila' : 0,
		# Use the ``username`` argument. The previous code sent the
		# module-level name ``login`` here and ignored the argument.
		'txtUser' : username,
		'txtSenha' : senha,
		# NOTE(review): this value looks already URL-encoded ("Login >>");
		# requests form-encodes ``data`` itself, so it may reach the server
		# double-encoded -- confirm what the server expects.
		'btnLogar' : 'Login+%3E%3E'
	}
	r = s.post('http://www.apostilando.com/autentica.php', data=payload)
	# Single-argument print with parentheses behaves the same on Python 2 and 3.
	print(r.status_code)

	return s


def download_section(sectionList, session=None):
	"""Fetch each section page and attach its parsed book entries.

	Args:
		sectionList: Sequence of section dicts, each with a 'url' key. The
			sequence itself is left untouched; each dict gains an
			'apostilas' key.
		session: Optional object whose ``get(url)`` returns a response with
			a ``content`` attribute. When omitted, the module-level session
			``s`` created by the script entry point is used.

	Returns:
		A list of the same section dicts, last input section first (the
		order the original pop-based traversal produced), each with
		'apostilas' set to the list of ``parse_apostila`` results for the
		entries found on that page.
	"""
	if not sectionList:
		return []

	# Resolve the fallback only when there is work to do, so an empty input
	# never requires the global session to exist.
	if session is None:
		session = s

	sections = []
	# Iterate instead of recursing with pop(): no recursion-depth limit and
	# the caller's list is not emptied as a side effect.
	for section in reversed(sectionList):
		# get list of books in a page
		r = session.get( section['url'] )
		page = etree.HTML( r.content )

		nodes = page.xpath('//div[@class="features_items"]//div[@class="media commnets"]')
		# A list comprehension yields a real list on both Python 2 and 3
		# (map() is lazy on Python 3).
		section['apostilas'] = [ parse_apostila( node ) for node in nodes ]

		sections.append( section )

	return sections


def parse_apostila( apostila ):
	"""Extract title, download link and description from one book entry.

	Args:
		apostila: Element-like object exposing ``xpath(expr)`` that returns
			a list of matches (the script passes lxml elements).

	Returns:
		``[titulo, download_link, descricao]`` taken from the first match of
		each XPath expression, or ``[]`` when any of the three is missing.
	"""
	try:
		titulo = apostila.xpath('.//h2/a/text()')[0]
		download_link = apostila.xpath('.//h2/a/@href')[0]
		descricao = apostila.xpath('./div/p/text()')[0]
	except IndexError:
		# An XPath query matched nothing, so the entry is incomplete.
		# Only this case is swallowed; other errors now propagate instead
		# of being hidden by a bare ``except``.
		return []

	return [titulo, download_link, descricao]

if __name__ == '__main__':
	# Script entry point: log in, scrape every configured section, print the result.
	#
	# A second, indentation-flattened copy of this whole script used to follow
	# here; it made the file fail with an IndentationError and has been removed.

	# Credentials are read from the environment and default to empty strings,
	# the values that were previously hard-coded. The user name is kept in
	# ``username`` because assigning it to ``login`` rebinds the login()
	# function and makes the call below raise TypeError.
	username = os.environ.get('APOSTILANDO_USER', '')
	senha = os.environ.get('APOSTILANDO_SENHA', '')

	# ``s`` must remain a module-level name: download_section() reads it as a global.
	s = requests.Session()
	s = login(s, username, senha)
	a = download_section( siteSetctions )
	# Single-argument print with parentheses behaves the same on Python 2 and 3.
	print(a)