Skip to content

Instantly share code, notes, and snippets.

@danilovazb
Created November 30, 2013 15:47
Show Gist options
  • Save danilovazb/7720655 to your computer and use it in GitHub Desktop.
Save danilovazb/7720655 to your computer and use it in GitHub Desktop.
Extrai CSV do IBGE
/****************************************
* Library imports used by the project
****************************************/
import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class downloadCSV {
/******************************************************
* Metodo 1 - getLinksUF
* - Pega as URLs que contenha uf.php? na pagina principal do IBGE Cidades
******************************************************/
public static void getLinksUF(String URL) throws IOException {
/****************************************************
* Variaveis para gravação de log
*/
FileWriter f = new FileWriter("logs.txt", true);
PrintWriter logMetd1 = new PrintWriter(f);
//***************************************************
Document doc = Jsoup.connect(URL).get();
Elements urlPesquisa = doc.select("a[href]");
for (Element urlUF : urlPesquisa) {
if (urlUF.attr("href").contains("uf.php?")
&& !urlUF.attr("href").contains("home.php?lang=_EN")
&& !urlUF.attr("href").contains("home.php?lang=_ES")
&& !urlUF.attr("href").contains("home.php?lang=")
&& !urlUF.attr("href").contains("index.php?lang="))
/*
* logMerd1 grava logs
*/
System.out.println("*********************************************************************************");
logMetd1.write("*********************************************************************************");
System.out.println("Metodo 1 ---> " + urlUF.attr("abs:href"));
logMetd1.write("Metodo 1 ---> " + urlUF.attr("abs:href"));
getLinksCidades(urlUF.attr("abs:href"));
System.out.println("*********************************************************************************");
logMetd1.write("*********************************************************************************");
}
}
/******************************************************
* Metodo 2 - getLinksUF
* - Pega as URLs que contenha perfil.php? na pagina de UF do IBGE Cidades
* - Elimina paginas que tenha lang=_ e /estadosat/perfil.php?lang=&sigla=
******************************************************/
public static void getLinksCidades(String URLUF) throws IOException {
/****************************************************
* Variaveis para gravação de log
*/
FileWriter f = new FileWriter("logs.txt", true);
PrintWriter logMetd2 = new PrintWriter(f);
//***************************************************
try{
Document doc = Jsoup.connect(URLUF).get();
Elements urlPesquisa = doc.select("a[href]");
for (Element linkCid : urlPesquisa) {
if (linkCid.attr("href").contains("perfil.php?")
&& !linkCid.attr("href").contains("lang=_")
&& !linkCid.attr("href").contains("/estadosat/perfil.php?lang=&sigla=")
&& !linkCid.attr("href").contains("home.php?lang=_EN")
&& !linkCid.attr("href").contains("home.php?lang=_ES")
&& !linkCid.attr("href").contains("home.php?lang=")
&& !linkCid.attr("href").contains("index.php?lang="))
System.out.println("Metodo 2 ---> " + linkCid.attr("abs:href"));
getLinksDados(linkCid.attr("abs:href"));
/*
* Grava logs de saída de comandos do módulo 3
*/
logMetd2.write("Metodo 2 ---> " + linkCid.attr("abs:href"));
}
}catch (SocketTimeoutException e) {
}
}
/******************************************************
* Metodo 3 - getLinksUF
* - Pega as URLs que contenha temas.php?lang=&codmun= e &idtema=16&search=
******************************************************/
public static void getLinksDados(String URLCID) throws IOException {
/****************************************************
* Variaveis para gravação dos arquivos .csv
*/
InputStream is = null;
BufferedInputStream buf = null;
FileOutputStream grava = null;
/****************************************************
* Variaveis para gravação de log
*/
FileWriter f = new FileWriter("logs.txt", true);
PrintWriter logMetd3 = new PrintWriter(f);
//***************************************************
try{
Document doc = Jsoup.connect(URLCID).get();
Elements urlPesquisa = doc.select("a[href]");
Elements titulo = doc.select(".csv");
Elements estado = doc.select(".uf");
Elements valor = doc.select("span[class=municipio titulo]");
Elements linkSintese = doc.select("li.sintese");
for (Element link : titulo) {
if (link.attr("href").contains("csv.php?lang=&idtema=16&codmun=")
&& !link.attr("href").contains("lang=_")
&& !link.attr("href").contains("/estadosat/perfil.php?lang=&sigla=")
&& !link.attr("href").contains("help.php?lang=")
&& !link.attr("href").contains("download/mapa_e_municipios.php?")
&& !link.attr("href").contains("/webcart")
&& !link.attr("href").contains("/home.php?lang=")
&& !link.attr("href").contains("/index.php?lang=")
&& !link.attr("href").contains("home.php?lang=_EN")
&& !link.attr("href").contains("home.php?lang=_ES")
&& !link.attr("href").contains("home.php?lang=")
&& !link.attr("href").contains("index.php?lang="))
System.out.println("Metodo 3 ---> " + link.attr("abs:href")+"\n\nEstado: " + estado.text() + "\nCidade: " + valor.text() + "\nDocumento: " + link.text() + "\nLink Download: "+ link.attr("abs:href"));
if (link.attr("href").contains("csv.php?lang=&idtema=16&codmun=")){
URL url = new URL(link.attr("abs:href"));
url.getHost();
url.getFile();
url.getPort();
url.getUserInfo();
URLConnection con = url.openConnection();
buf = new BufferedInputStream(con.getInputStream());
grava = new FileOutputStream("C:\\Users\\unknown\\Desktop\\Imagem\\" + estado.text() + " - " + valor.text() + " - " + link.text() + ".csv");
int i = 0;
byte[] bytesIn = new byte[1024];
while ((i = buf.read(bytesIn)) >= 0) {
grava.write(bytesIn, 0, i);
}
if (buf != null) {
buf.close();
}
if (grava != null) {
grava.close();
}
/*
* Grava logs de saída de comandos do módulo 3
*/
logMetd3.write("Metodo 3 ---> " + link.attr("abs:href"));
}}
}catch (SocketTimeoutException e) {
}
}
public static void getDados(String URLLD) throws IOException {
/****************************************************
* Variaveis para gravação de log
*/
FileWriter f = new FileWriter("logs.txt", true);
PrintWriter logMetd4 = new PrintWriter(f);
//***************************************************
try{
Document doc = Jsoup.connect(URLLD).get();
Elements urlPesquisa = doc.select("a[href]");
for (Element link : urlPesquisa) {
if (link.attr("href").contains("csv.php?lang=")
&& link.attr("href").contains("&idtema=16&search=")
&& !link.attr("href").contains("lang=_")
&& !link.attr("href").contains("/estadosat/perfil.php?lang=&sigla=")
&& !link.attr("href").contains("help.php?lang=")
&& !link.attr("href").contains("download/mapa_e_municipios.php?")
&& !link.attr("href").contains("/webcart")
&& !link.attr("href").contains("/home.php?lang=")
&& !link.attr("href").contains("/index.php?lang=")
&& !link.attr("href").contains("home.php?lang=_EN")
&& !link.attr("href").contains("home.php?lang=_ES")
&& !link.attr("href").contains("home.php?lang=")
&& !link.attr("href").contains("index.php?lang="))
System.out.println("Metodo 4 ---> " + link.attr("abs:href"));
/*
* Grava logs de saída de comandos do módulo 3
*/
logMetd4.write("Metodo 4 ---> " + link.attr("abs:href"));
}
}catch (SocketTimeoutException e) {
}
}
/******************************************************
* Metodo Main do programa
* @throws IOException
******************************************************/
public static void main(String[] args) throws IOException {
// TODO Auto-generated method stub
getLinksUF("http://cidades.ibge.gov.br/xtras/home.php");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment