Skip to content

Instantly share code, notes, and snippets.

@sousatg
Last active February 14, 2016 11:56
Show Gist options
  • Save sousatg/92f6d036fd52068b6097 to your computer and use it in GitHub Desktop.
Save sousatg/92f6d036fd52068b6097 to your computer and use it in GitHub Desktop.
Python scraper to download ebooks from apostilando.com.br
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import uuid
import os
import csv
from lxml import etree
import time
# Site sections to scrape, kept as (listing URL, category labels) rows.
# The labels are a single comma-separated string per section.
_SECTION_ROWS = (
    ('http://www.apostilando.com/sessao.php?cod=1', 'webdesign, asp'),
    ('http://www.apostilando.com/sessao.php?cod=5', 'webdesign, html'),
    ('http://www.apostilando.com/sessao.php?cod=8', 'webdesign, php'),
    ('http://www.apostilando.com/sessao.php?cod=26', 'webdesign, mysql'),
    ('http://www.apostilando.com/sessao.php?cod=9', 'webdesign, xml'),
    ('http://www.apostilando.com/sessao.php?cod=3', 'webdesign, css'),
    ('http://www.apostilando.com/sessao.php?cod=2', 'webdesign, cgi, perl'),
    ('http://www.apostilando.com/sessao.php?cod=6', 'webdesign, java'),
    ('http://www.apostilando.com/sessao.php?cod=7', 'webdesign, javascript'),
    ('http://www.apostilando.com/sessao.php?cod=25', 'webdesign, diversas'),
)

# One dict per section, in the same order as the rows above; each dict holds
# the page to fetch under 'url' and its labels under 'categorias'.
siteSetctions = [
    {'url': _url, 'categorias': _labels}
    for _url, _labels in _SECTION_ROWS
]
def login(s, username, senha):
    """Log the given session in to the site.

    Requests the login page, then posts the credentials to the
    authentication endpoint and prints the HTTP status code of that
    response. The status code is only printed; whether the credentials were
    actually accepted is not checked here.

    Args:
        s: Session-like object exposing ``get(url)`` and
            ``post(url, data=...)`` (a ``requests.Session`` in this script).
        username: Account user name, sent as the ``txtUser`` form field.
        senha: Account password, sent as the ``txtSenha`` form field.

    Returns:
        The same session object ``s`` that was passed in.
    """
    login_url = 'http://www.apostilando.com/login.php'
    # Hit the login page before authenticating; presumably this lets the
    # server hand out its session cookie -- TODO confirm.
    s.get(login_url)

    payload = {
        'cod_apostila': 0,
        # Must be the ``username`` argument, not the module-level name
        # ``login`` (which is this function, or whatever rebinds it).
        'txtUser': username,
        'txtSenha': senha,
        # Plain text on purpose: the session form-encodes ``data`` itself,
        # so a pre-encoded value ('Login+%3E%3E') would be escaped twice.
        'btnLogar': 'Login >>',
    }
    r = s.post('http://www.apostilando.com/autentica.php', data=payload)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(r.status_code)
    return s
# Walk a list of website sections and collect the books listed on each page.
def download_section(sectionList, session=None):
    """Fetch every section page and attach its parsed book entries.

    Args:
        sectionList: Sequence of section dicts; each must have a ``'url'``
            key. The sequence itself is not modified, but every dict in it
            gains an ``'apostilas'`` key.
        session: Optional session-like object with a ``get(url)`` method.
            When omitted, the module-level session ``s`` (created by the
            script entry point) is used.

    Returns:
        A new list containing the same section dicts, each with
        ``'apostilas'`` set to the list of ``parse_apostila`` results for
        that page. Sections come back last-to-first relative to
        ``sectionList``. An empty input yields ``[]``.
    """
    if not sectionList:
        return []
    if session is None:
        # Fall back to the module-level session so existing one-argument
        # callers keep working.
        session = s

    sections = []
    # Iterate from the end to keep the result ordering callers already see,
    # without draining the caller's list or recursing once per section.
    for section in reversed(sectionList):
        r = session.get(section['url'])
        page = etree.HTML(r.content)
        # NOTE(review): "commnets" is kept exactly as written; presumably it
        # mirrors the class name used in the site's markup -- confirm.
        nodes = page.xpath(
            '//div[@class="features_items"]//div[@class="media commnets"]'
        )
        # Build a real list (map() would be lazy on Python 3).
        section['apostilas'] = [parse_apostila(node) for node in nodes]
        sections.append(section)
    return sections
def parse_apostila(apostila):
    """Extract title, download link and description from one book entry.

    Args:
        apostila: Node exposing an ``xpath(expr)`` method that returns a
            list of matches (an lxml element in this script).

    Returns:
        ``[titulo, download_link, descricao]`` taken from the first match of
        each lookup, or ``[]`` when any of the three lookups has no match.
    """
    try:
        titulo = apostila.xpath('.//h2/a/text()')[0]
        download_link = apostila.xpath('.//h2/a/@href')[0]
        descricao = apostila.xpath('./div/p/text()')[0]
    except IndexError:
        # A lookup returned no match: treat the entry as unparseable.
        # Only IndexError is handled so that interrupts and genuine bugs
        # are not silently swallowed.
        return []
    return [titulo, download_link, descricao]
if __name__ == '__main__':
    # Script entry point: log in, scrape every configured section, print
    # the collected data.
    # The user name is deliberately NOT stored in a variable called
    # ``login``: that would rebind the module-level name and make the
    # ``login(...)`` call below fail with "'str' object is not callable".
    username = ''
    senha = ''
    # ``s`` has to stay a module-level name because download_section()
    # reads the session from it.
    s = requests.Session()
    s = login(s, username, senha)
    a = download_section(siteSetctions)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(a)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment