souenzzo/coletor.py

## coletor.py
#!/usr/bin/env python3

from http import client
from bs4 import BeautifulSoup

def getElems(tr):
    """Return the ``.string`` of each child of *tr*, minus lone newlines.

    Args:
        tr: Iterable of nodes exposing a ``.string`` attribute (callers
            in this file pass table rows).

    Returns:
        A list with one entry per child whose ``.string`` is not exactly
        ``"\\n"``. A child whose ``.string`` is ``None`` is kept as
        ``None``, because only the newline value is filtered.
    """
    return [cell.string for cell in tr if cell.string != "\n"]

def structuralize(table):
    """Turn a table node into a list of dictionaries, one per body row.

    The keys come from the first header row (``table.thead.tr``) and the
    values from each child of ``table.tbody`` that is not a lone newline.

    Args:
        table: Node exposing ``.thead.tr`` and an iterable ``.tbody``.

    Returns:
        A list of dicts mapping header text to cell text. ``zip`` pairs
        them positionally and stops at the shorter of the two sequences.
    """
    header = getElems(table.thead.tr)
    return [
        dict(zip(header, getElems(row)))
        for row in table.tbody
        if row != "\n"
    ]

def getInfo(soup):
    """Extract the rows of the element whose id is ``table_disciplinas``.

    Args:
        soup: Parsed document exposing ``find(id=...)``.

    Returns:
        Whatever ``structuralize`` produces for the located table.
    """
    return structuralize(soup.find(id="table_disciplinas"))

def nextPage(soup):
    """Tell whether the page contains a "next_page" anchor.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        ``True`` when ``soup.find_all("a", "next_page")`` is non-empty,
        ``False`` otherwise.
    """
    links = soup.find_all("a", "next_page")
    return links != []

def get_data(periodo, curso):
    """Download every result page for one period/course pair.

    Requests ``/transparencia/disciplinas_que_mais_reprovam`` on
    ``sistemas.uff.br`` starting at page 1, and keeps requesting the
    following page for as long as the page just fetched contains a
    "next_page" link.

    Args:
        periodo: Integer period code (e.g. ``20152``); formatted with
            ``%d`` into the query string.
        curso: Integer course identifier; formatted with ``%d`` into the
            query string.

    Returns:
        A dict with the keys ``"periodo"``, ``"curso"`` and
        ``"disciplinas"``; the latter is the list of row dicts
        accumulated over all pages.

    Raises:
        Nothing is caught here: connection errors and parsing errors
        (for example when the expected table is absent) propagate to the
        caller.
    """
    base = '/transparencia/disciplinas_que_mais_reprovam'
    data = {"periodo": periodo,
            "curso": curso,
            "disciplinas": []}

    conn = client.HTTPSConnection("sistemas.uff.br")
    try:
        page = 1
        while True:
            path = "%s?page=%d&periodo=%d&curso=%d" % (base, page, periodo, curso)
            conn.request("GET", path)
            # NOTE(review): the HTTP status is not inspected; a non-200
            # body is parsed like any other page.
            raw = conn.getresponse().read()
            soup = BeautifulSoup(raw, "html.parser")
            data["disciplinas"].extend(getInfo(soup))
            if not nextPage(soup):
                break
            page += 1
    finally:
        # Release the socket on success and on failure alike.
        conn.close()
    return data

if __name__ == "__main__":
    # Collect the data of every (year, semester, course) combination.
    # NOTE(review): `db` is only built in memory; nothing here writes it out.
    db = {"nome": "disciplinas",
          "data": []
    }
    for ano in range(1999, 2017):  # years
        for semestre in [1, 2]:  # semesters
            # Period code is the year followed by the semester digit.
            periodo = (ano * 10) + semestre
            for curso in range(200):  # courses
                try:
                    # Use the loop values; the call used to be hard-coded
                    # to (20152, 25), fetching the same data every time.
                    x = get_data(periodo, curso)
                except Exception:
                    # Best effort: skip combinations that fail to download
                    # or parse, but let KeyboardInterrupt/SystemExit through.
                    pass
                else:
                    db["data"].append(x)
	#!/usr/bin/env python3

	from http import client
	from bs4 import BeautifulSoup

	def getElems(tr):
		"""Return the ``.string`` of each child of *tr*, minus lone newlines.

		Args:
			tr: Iterable of nodes exposing a ``.string`` attribute.

		Returns:
			A list with one entry per child whose ``.string`` is not
			exactly ``"\\n"``; a ``None`` value is kept as ``None``.
		"""
		# Nesting restored: the body had lost its indentation.
		res = []
		for th in tr:
			if th.string != "\n":
				res.append(th.string)
		return res

	def structuralize(table):
		"""Turn a table node into a list of dictionaries, one per body row.

		Args:
			table: Node exposing ``.thead.tr`` and an iterable ``.tbody``.

		Returns:
			A list of dicts whose keys are the header texts and whose
			values are the cell texts of each non-newline ``tbody`` child,
			paired positionally by ``zip``.
		"""
		# Nesting restored: the body had lost its indentation.
		struct = []
		titles = getElems(table.thead.tr)
		for elem in table.tbody:
			if elem != "\n":
				coisa = getElems(elem)
				struct.append(dict(zip(titles, coisa)))
		return struct

	def getInfo(soup):
		"""Extract the rows of the element whose id is ``table_disciplinas``.

		Args:
			soup: Parsed document exposing ``find(id=...)``.

		Returns:
			Whatever ``structuralize`` produces for the located table.
		"""
		# Nesting restored: the body had lost its indentation.
		table = soup.find(id="table_disciplinas")
		return structuralize(table)

	def nextPage(soup):
		"""Tell whether the page contains a "next_page" anchor.

		Args:
			soup: Parsed document exposing ``find_all``.

		Returns:
			``False`` when ``soup.find_all("a", "next_page")`` is empty,
			``True`` otherwise.
		"""
		# Nesting restored: the body had lost its indentation.
		pag = soup.find_all("a", "next_page")
		if pag == []:
			return False
		return True

	def get_data(periodo, curso):
		"""Download every result page for one period/course pair.

		Requests ``/transparencia/disciplinas_que_mais_reprovam`` on
		``sistemas.uff.br`` starting at page 1, and keeps requesting the
		following page while the page just fetched contains a "next_page"
		link.

		Args:
			periodo: Integer period code (e.g. ``20152``); formatted with
				``%d`` into the query string.
			curso: Integer course identifier; formatted with ``%d`` into
				the query string.

		Returns:
			A dict with the keys ``"periodo"``, ``"curso"`` and
			``"disciplinas"`` (the row dicts accumulated over all pages).

		Raises:
			Nothing is caught here: connection and parsing errors
			propagate to the caller.
		"""
		# Nesting restored: the body had lost its indentation.
		BASE = '/transparencia/disciplinas_que_mais_reprovam'
		page = 1

		data = {"periodo": periodo,
				"curso": curso,
				"disciplinas": []}

		conn = client.HTTPSConnection("sistemas.uff.br")
		try:
			while True:
				PATH = "%s?page=%d&periodo=%d&curso=%d" % (BASE, page, periodo, curso)
				conn.request("GET", PATH)
				res = conn.getresponse()
				raw = res.read()
				soup = BeautifulSoup(raw, "html.parser")
				info = getInfo(soup)
				data["disciplinas"].extend(info)
				page = page + 1
				if not nextPage(soup):
					break
		finally:
			# Release the socket on success and on failure alike.
			conn.close()
		return data

	if __name__ == "__main__":
		# Nesting restored: the loops had lost their indentation.
		# Collect the data of every (year, semester, course) combination.
		db = {"nome": "disciplinas",
			  "data": []
		}
		for ano in range(1999, 2017):  # years
			for semestre in [1, 2]:  # semesters
				for curso in range(200):  # courses
					try:
						# Period code is the year followed by the semester digit.
						periodo = (ano * 10) + semestre
						# Use the loop values instead of the hard-coded (20152, 25).
						x = get_data(periodo, curso)
					except Exception:
						# Best effort: skip failing combinations, but let
						# KeyboardInterrupt/SystemExit through.
						pass
					else:
						db["data"].append(x)