Skip to content

Instantly share code, notes, and snippets.

@phoemur
Last active June 13, 2021 01:58
Show Gist options
  • Save phoemur/4f05bd9900578e48a76d to your computer and use it in GitHub Desktop.
Save phoemur/4f05bd9900578e48a76d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import re
import urllib.request
import urllib.parse
import http.cookiejar
import os
from lxml.html import fragment_fromstring
from collections import OrderedDict
def remove_disallowed_filename_chars(filename):
    """Return *filename* reduced to characters that are safe in a file name.

    Every character for which ``str.isalnum()`` is false is replaced by an
    underscore, and each run of consecutive underscores is then collapsed
    into a single underscore.

    Args:
        filename: The raw text to sanitise.

    Returns:
        The sanitised string. An empty input yields an empty string.
    """
    replaced = "".join(ch if ch.isalnum() else "_" for ch in filename)
    # A single regex pass collapses every run of underscores, instead of
    # rescanning and copying the whole string once per shortening step.
    return re.sub(r"_{2,}", "_", replaced)
def get_lista(*args, **kwargs):
    """Return every real-estate fund (FII) listed on BVMF.

    Downloads the fund listing page, extracts its ``<table>`` markup and
    reads one entry per table row.

    Args:
        *args: Accepted and ignored.
        **kwargs: Accepted and ignored.

    Returns:
        An ``OrderedDict``, in page order, mapping the text of the first
        element inside each row's fourth cell to a detail-page URL built
        from the first element inside the row's first cell.

    Raises:
        IndexError: If the page contains no ``<table>`` markup, or a row
            does not have the expected cells/attributes.
        urllib.error.URLError: If the listing page cannot be fetched.
    """
    base_url = 'http://www2.bmfbovespa.com.br/Fundos-Listados/'
    url = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListados.aspx?tipoFundo=imobiliario&Idioma=pt-br'

    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
        ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01'),
    ]
    with opener.open(url) as ur:
        content = ur.read().decode('UTF-8')

    # Greedy match on purpose: spans from the first <table> to the last
    # </table> in the page, exactly as the original pattern did.
    match = re.search(r'<table>.*</table>', content, re.DOTALL)
    if match is None:
        # Keep the exception type the bare ``[0]`` lookup used to raise, but
        # say what actually went wrong (e.g. the site layout changed).
        raise IndexError('no <table> markup found in the fund listing page: ' + url)
    page = fragment_fromstring(match.group(0))

    lista = OrderedDict()
    for row in page.xpath('tr'):
        cells = row.findall('td')
        # Indexing an element yields its child; this replaces the deprecated
        # ``getchildren()`` call.
        name = cells[3][0].text
        # NOTE(review): the second attribute of the first cell's child is
        # taken as the relative link, presumably its ``href`` -- confirm
        # against the live markup.
        relative_link = cells[0][0].items()[1][1]
        lista[name] = base_url + relative_link
    return lista
def get_files(fii, link):
    """Download every document published for the selected FII.

    Three tabs of the fund's detail page are fetched. Every file listed in
    the document table of each tab is saved into a directory named after
    *fii* (created if needed), unless a file with the same name is already
    there. The process working directory is never changed, so a failure
    half-way through cannot leave the caller inside the fund's directory.

    Args:
        fii: Fund ticker. It is upper-cased for the page URLs and used
            unchanged as the name of the destination directory.
        link: Unused; kept so existing callers keep working. The page URLs
            are derived from *fii* alone.

    Raises:
        urllib.error.URLError: If a page or a document cannot be fetched.
        IndexError: If a table row lacks the expected cell or attributes.
    """
    sigla = fii.upper()
    COMUNICADOS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=abaPrincipal'.format(sigla)
    RELATORIOS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=subAbaDemonstracoesFinanceiras&idioma=pt-br'.format(sigla)
    OUTROS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=subAbaOutrosDocumentos&idioma=pt-br'.format(sigla)

    os.makedirs(fii, exist_ok=True)

    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
        ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01'),
    ]

    # Compiled once; the same pattern is applied to every tab.
    pattern = re.compile(r'tbArqListados.*(<table>.*</table>)', re.DOTALL)

    for pagina in [COMUNICADOS, RELATORIOS, OUTROS]:
        with opener.open(pagina) as ur:
            content = ur.read().decode('UTF-8')

        match = pattern.search(content)
        if match is None:
            # A tab without the document table must not abort the other tabs.
            print('Nenhuma tabela de arquivos encontrada em {}'.format(pagina))
            continue
        page = fragment_fromstring(match.group(1))

        # Parse the whole table before downloading anything from it.
        arquivos = []
        for row in page.xpath('tr'):
            # Indexing an element yields its child; this replaces the
            # deprecated ``getchildren()`` call.
            anchor = row.findall('td')[0][0]
            filename = remove_disallowed_filename_chars(anchor.text) + '.PDF'
            # NOTE(review): the third attribute is taken as the download
            # URL, presumably the ``href`` -- confirm against live markup.
            arquivos.append((filename, anchor.items()[2][1]))

        for filename, file_url in arquivos:
            destino = os.path.join(fii, filename)
            if os.path.exists(destino):
                print('{} já está salvo no diretório'.format(filename))
                continue
            print('Downloading {}'.format(filename))
            download_url = file_url.replace('http://www.bmfbovespa', 'http://www2.bmfbovespa')
            # Read the whole body before opening the target, so a failed
            # download never leaves a truncated file that a later run would
            # mistake for an already-saved document.
            with urllib.request.urlopen(download_url) as ur:
                data = ur.read()
            with open(destino, mode='wb') as fh:
                fh.write(data)
if __name__ == '__main__':
    import sys

    # Script entry point: every command-line argument selects the funds
    # whose ticker occurs inside it, and all documents of each selected
    # fund are downloaded.
    if len(sys.argv) == 1 or sys.argv[1] in {'-h', '--help'}:
        print('Modo de uso: {0} "FII[1]" "FII[2]" ... "FII[N]"'.format(sys.argv[0]))
        sys.exit(1)

    lista = get_lista()
    for arg in sys.argv[1:]:
        alvo = arg.upper()
        encontrado = False
        for fii, link in lista.items():
            # Plain substring test: the ticker is data, not a regular
            # expression, so it must not be handed to ``re.search`` as a
            # pattern. Empty or missing tickers are skipped because they
            # would match every argument (or raise).
            if fii and fii in alvo:
                encontrado = True
                get_files(fii, link)
        if not encontrado:
            # Tell the user instead of silently doing nothing.
            print('Nenhum FII encontrado para "{}"'.format(arg))
@phoemur
Copy link
Author

phoemur commented Mar 19, 2016

Este script faz o download de todos os arquivos disponíveis sobre determinado Fundo de Investimento Imobiliário listado na BVMF.
Modo de uso: python fii.py FII[1] FII[2] ... FII[N]

Se o arquivo já estiver salvo ele não faz o download, só baixa os arquivos novos. Necessita da dependência lxml instalada:
pip install lxml

@felipemaion
Copy link

Fala @phoemur, tem atualização? Aqui o 'reg' (linha 62) está vindo vazio (teste "RNGO11" "BRCR11")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment