Skip to content

Instantly share code, notes, and snippets.

@DiegoQueiroz
Created May 4, 2015 00:52
Show Gist options
  • Save DiegoQueiroz/e03873b33208b5356849 to your computer and use it in GitHub Desktop.
Save DiegoQueiroz/e03873b33208b5356849 to your computer and use it in GitHub Desktop.
Override HTTP redirect behavior to CNPq Lattes
# -*- coding: utf-8 -*-
import sys, re
if sys.version_info.major == 3:
# usando Python 3
from urllib.request import Request, build_opener, HTTPRedirectHandler, HTTPCookieProcessor
else:
# usando Python 2
from urllib2 import Request, build_opener, HTTPRedirectHandler, HTTPCookieProcessor
class LattesHTTPRedirectHandler(HTTPRedirectHandler):
    """Redirect handler that rewrites Lattes redirect targets.

    Requests to Lattes go through several internal HTTP redirects. When a
    redirect target carries the query field ``metodo=apresentar``, the value
    is replaced with ``captchaValido`` before the redirect is followed, in
    order to work around the captcha requirement.
    """

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return the request used to follow a redirect to a rewritten URL.

        All arguments are forwarded unchanged to
        ``HTTPRedirectHandler.redirect_request`` except ``newurl``, in which
        ``apresentar`` is replaced by ``captchaValido`` wherever it is the
        value of a ``metodo`` query field (i.e. directly preceded by
        ``?metodo=`` or ``&metodo=``).
        """
        newurl = re.sub(r'(?<=[?&]metodo=)apresentar', 'captchaValido', newurl)
        # Call the base class explicitly rather than through super(): under
        # Python 2 the urllib2 handlers are old-style classes, and super()
        # raises TypeError when given one. The explicit call works on both
        # Python 2 and Python 3.
        return HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, headers, newurl)
if __name__ == "__main__":
    # Example run: fetch one Lattes CV page through the custom redirect
    # handler and print the raw response body.
    txdata = None
    txheaders = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0) Gecko/20100101 Firefox/4.0',
        #
        # As far as could be determined, the headers commented out below are
        # not needed for this script to work. The cookies in particular only
        # confuse the server; removing them is suggested.
        #
        #'Accept-Language': 'en-us,en;q=0.5',
        #'Accept-Encoding': 'deflate',
        #'Keep-Alive': '115',
        #'Connection': 'keep-alive',
        #'Cache-Control': 'max-age=0',
        #'Cookie': 'style=standard; __utma=140185953.294397416.1313390179.1313390179.1317145115.2; __utmz=140185953.1317145115.2.2.utmccn=(referral)|utmcsr=emailinstitucional.cnpq.br|utmcct=/ei/emailInstitucional.do|utmcmd=referral; JSESSIONID=1B98ABF9642E01597AABA0F7A8807FD1.node2',
    }
    url = 'http://lattes.cnpq.br/4727357182510680'
    req = Request(url, txdata, txheaders)

    # To work reliably, HTTPCookieProcessor must also be enabled; otherwise
    # cookies are lost between redirects and the script becomes unstable.
    # Note that build_opener receives INSTANCES of the handler classes.
    lattesOpener = build_opener(LattesHTTPRedirectHandler(), HTTPCookieProcessor())

    arquivoH = lattesOpener.open(req)
    try:
        cvLattesHTML = arquivoH.read()
    finally:
        # Always release the connection, even if read() raises.
        # try/finally is used (rather than `with`) because the Python 2
        # urllib2 response object is not a context manager.
        arquivoH.close()

    # NOTE: on Python 3 read() returns bytes, so this prints the bytes repr.
    print(cvLattesHTML)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment