Skip to content

Instantly share code, notes, and snippets.

@sousatg
Created August 28, 2017 06:46
Show Gist options
  • Save sousatg/02f3c41faaa728fe2baba543443f0c9e to your computer and use it in GitHub Desktop.
Save sousatg/02f3c41faaa728fe2baba543443f0c9e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# http://assicanti.pt/category/uptec/
import PyPDF2
import re
from itertools import takewhile
def ementa(data):
    """Print every weekday header found in *data*, in order.

    Each element of *data* is compared against the weekday names
    'SEGUNDA' and 'TERÇA'; matching elements are printed one per line.

    The sequence is walked with a plain loop instead of one recursive call
    per element, so long token lists neither hit the interpreter recursion
    limit nor pay for copying the tail of the list at every step.

    Args:
        data: sequence of string tokens to scan.

    Returns:
        The string "a" when *data* is empty (the original base-case value,
        kept for backward compatibility); None otherwise.
    """
    if not data:
        return "a"
    for token in data:
        if token in ('SEGUNDA', 'TERÇA'):
            # Parenthesised form is valid on both Python 2 and Python 3.
            print(token)
    return None
# Extract the text of the first page of 'ement.pdf'. The file is opened in
# a `with` block so the handle is closed once the text has been read.
with open('ement.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    pageObj = pdfReader.getPage(0)
    # Collapse runs of spaces into a single space.
    text = re.sub(r' +', ' ', pageObj.extractText())
# Collapse any remaining run of two or more whitespace characters, then drop
# the single CR/LF/TAB characters that are left.
text = re.sub(r'\s{2,}', ' ', text)
text = re.sub(r'[\r\n\t]', '', text)
# Split the string into its elements: cut at each whitespace character that
# is followed by at least two uppercase ASCII letters.
data = re.split(r'\s(?=[A-Z]{2,})', text.strip())
ementa(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment