Skip to content

Instantly share code, notes, and snippets.

@fmasanori
Last active July 10, 2017 02:26
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save fmasanori/be88f2a3e1526a07d01c392bde49d902 to your computer and use it in GitHub Desktop.
Save fmasanori/be88f2a3e1526a07d01c392bde49d902 to your computer and use it in GitHub Desktop.
Juízes que mais atuaram segundo o site da CBF
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
def readPDF(pdfFile):
    """Extract the text of a PDF and return it as one string.

    Args:
        pdfFile: PDF file object; it is handed to pdfminer's
            ``process_pdf`` unchanged.

    Returns:
        Everything the ``TextConverter`` wrote to the in-memory buffer.

    Raises:
        Whatever ``process_pdf`` raises; the converter and the buffer are
        closed before the exception propagates.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The buffer is a context manager, so it is released on every exit path.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter on the error path as well as on success.
            device.close()
        # Read the text while the buffer is still open.
        return retstr.getvalue()
# Fetch the 2016 fixtures page and collect the PDF URLs embedded in the
# links of every 'full-game-links' block.
# The context manager closes the HTTP response once the body has been read.
with urlopen("http://www.cbf.com.br/competicoes/brasileiro-serie-a/tabela/2016#.WQTfO9orLIV") as html:
    bsObj = BeautifulSoup(html.read(), 'html.parser')
nameList = bsObj.findAll('div', class_="full-game-links")
sumulas = []
for name in nameList:
    res = name.findAll('a')
    if not res:
        # No anchors in this block: nothing to extract.
        continue
    # When the first anchor mentions 'retificacoes', the third anchor is
    # used instead of the first.
    if 'retificacoes' in str(res[0]):
        if len(res) < 3:
            # The expected third anchor is missing; skip instead of crashing.
            continue
        res = str(res[2])
    else:
        res = str(res[0])
    # The URL sits between 'url[]=' and the end of '.pdf' in the anchor HTML.
    j = res.find('url[]=')
    k = res.find('.pdf')
    if j == -1 or k == -1:
        # Without both delimiters the slice below would be meaningless.
        continue
    sumula = res[j+6:k+4]
    if sumula:
        sumulas.append(sumula)
# Download every collected PDF into the current directory and remember the
# local file names for the parsing step.
arquivos = []
for sumula in sumulas:
    j = sumula.find('2016')
    # Local name: the part of the URL that follows '2016' plus one more
    # character (presumably the path separator -- TODO confirm URL layout).
    nome = sumula[j+5:]
    print(nome)
    # Download before opening the output file so a failed request does not
    # leave an empty file behind; both handles are closed by their `with`.
    with urlopen(sumula) as resposta:
        dados = resposta.read()
    with open(nome, 'wb') as f:
        f.write(dados)
    arquivos.append(nome)
# Count how many times each name appears after a marker label in the text
# extracted from the downloaded PDFs. `wc` maps name -> number of files.
antes_juiz = 'Quarto Arbitro:\n\n'  # marker used when no optional label is present
# Optional labels, highest precedence first: when several occur in the same
# document the earliest one in this tuple wins.
rotulos = (
    'Tutor:\n\n',
    'Assessor:\n\n',
    'Inspetor:\n\n',
    'Delegado Local:\n\n',
    'Delegado Especial:\n\n',
)
wc = {}
for arquivo in arquivos:
    # `with` closes the PDF even if text extraction raises.
    with open(arquivo, 'rb') as pdfFile:
        res = readPDF(pdfFile)
    marcador = next((rotulo for rotulo in rotulos if rotulo in res), antes_juiz)
    j = res.find(marcador)
    if j == -1:
        # Only the default marker can be absent here; slicing from a -1
        # offset would count unrelated text as a name, so skip the file.
        print('Juiz não encontrado em', arquivo)
        continue
    j += len(marcador)
    k = res.find('\n\n', j)  # blank line that ends the name
    # If no blank line follows, the name runs to the end of the text.
    juiz = res[j:k] if k != -1 else res[j:]
    print(juiz)
    wc[juiz] = wc.get(juiz, 0) + 1
def atuações(dupla):
    """Sort key: return the count stored at index 1 of a (name, count) pair."""
    return dupla[1]


# Print the ranking, highest count first, and save it to juizes16.csv.
print('\nJuízes que mais atuaram')
duplas = sorted(wc.items(), key=atuações, reverse=True)
# `with` guarantees the CSV is flushed and closed even if a write fails.
with open("juizes16.csv", 'wt', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(['Juiz', 'Atuações'])
    for dupla in duplas:
        print(dupla[0], dupla[1])
        writer.writerow(dupla)
@faelp22
Copy link

faelp22 commented May 5, 2017

Olá, professor, achei muito interessante esse trabalho.
Outra coisa: fiz uma pequena mudança porque tentei executar e estava dando erro:

TypeError: unicode argument expected, got 'str' 
File "crawler.py", line 52, in <module> res = readPDF(pdfFile)

Sendo assim, troquei o import por: from io import BytesIO as StringIO
Estou usando Python 2.7.12.

@fmasanori
Copy link
Author

Esqueci de falar que só uso Python 3, sorry.

@caiaffa
Copy link

caiaffa commented Jun 5, 2017

Com qual comando você instalou o pdfminer para Python 3?

abraços

@fmasanori
Copy link
Author

pip install pdfminer3k

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment