Skip to content

Instantly share code, notes, and snippets.

@alanboy
Created July 30, 2014 07:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alanboy/10fcfc4c7a229e0bbd2e to your computer and use it in GitHub Desktop.
Save alanboy/10fcfc4c7a229e0bbd2e to your computer and use it in GitHub Desktop.
Web Crawler
import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.sql.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Date;
import java.util.regex.*;
import java.util.zip.GZIPOutputStream;
/**
 * Minimal web crawler.
 *
 * Each iteration takes one not-yet-visited row from the Spider table, downloads
 * the page, stores every link found in it as a new Spider row, saves a gzipped
 * copy of the page under docs/ and finally stamps the row with today's date.
 */
class Main {

    /** Maximum number of pages fetched in one run. */
    private static final int MAX_PAGES = 50;

    /**
     * Charset used when the server does not declare one. ISO-8859-1 maps every
     * byte to the char with the same value, so undeclared pages decode as before.
     */
    private static final Charset FALLBACK_CHARSET = Charset.forName("ISO-8859-1");

    /** Charset used for the compressed copies written to docs/. */
    private static final Charset STORAGE_CHARSET = Charset.forName("UTF-8");

    /** Extracts the charset parameter of a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /**
     * Matches the opening tag of an anchor and captures its href value
     * (group 1: double-quoted value, group 2: single-quoted value).
     * [^>] also matches line breaks, so tags spanning several lines are found.
     */
    private static final Pattern ANCHOR_HREF = Pattern.compile(
            "<a\\b[^>]*?\\shref\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')",
            Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the resource behind {@code url} and returns it as text.
     *
     * The body is decoded with the charset announced in the Content-Type header,
     * falling back to ISO-8859-1 when none (or an unknown one) is given.
     *
     * @param url absolute URL to fetch
     * @return the decoded body; if the connection cannot be opened, the
     *         {@code toString()} of the IOException is returned instead
     *         (callers rely on this method not failing for unreachable pages)
     * @throws Exception if the URL is malformed or reading the body fails
     */
    static String getContent(String url) throws Exception {
        URLConnection hpCon = new URL(url).openConnection();
        InputStream input;
        try {
            input = hpCon.getInputStream();
        } catch (IOException ioe) {
            return ioe.toString();
        }
        try {
            Reader reader = new InputStreamReader(input, charsetOf(hpCon.getContentType()));
            StringBuilder contenido = new StringBuilder();
            char[] buffer = new char[8192];
            int leidos;
            while ((leidos = reader.read(buffer)) != -1) {
                contenido.append(buffer, 0, leidos);
            }
            return contenido.toString();
        } finally {
            // Always release the connection's stream, even when reading fails.
            input.close();
        }
    }

    /** Returns the charset named in a Content-Type value, or the fallback charset. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PARAM.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (IllegalArgumentException unknownCharset) {
                    // Illegal or unsupported charset name: use the fallback below.
                }
            }
        }
        return FALLBACK_CHARSET;
    }

    /**
     * Extracts the hyperlinks of an HTML document.
     *
     * Looks for {@code <a ... href="...">} tags (any letter case, single or
     * double quotes, possibly spanning several lines) and resolves every
     * non-empty href against {@code url}.
     *
     * @param url  address of the document, used to resolve relative links
     * @param html raw HTML source; {@code null} is treated as an empty document
     * @return the absolute form of every link that could be turned into a URL,
     *         in document order (duplicates are kept)
     */
    public static ArrayList<String> parseLinks(URL url, String html) {
        ArrayList<String> links = new ArrayList<String>();
        if (html == null) {
            return links;
        }
        Matcher m = ANCHOR_HREF.matcher(html);
        while (m.find()) {
            String href = m.group(1) != null ? m.group(1) : m.group(2);
            // Skip empty hrefs.
            if (href.length() == 0) {
                continue;
            }
            String absolute = toAbsolute(url, href);
            if (absolute != null) {
                links.add(absolute);
            }
        }
        return links;
    }

    /**
     * Returns {@code href} as an absolute URL string: as-is when it is already
     * absolute, otherwise resolved against {@code base}. Returns {@code null}
     * when it cannot be interpreted as a URL at all.
     */
    private static String toAbsolute(URL base, String href) {
        try {
            return new URL(href).toExternalForm();
        } catch (MalformedURLException notAbsolute) {
            try {
                return new URL(base, href).toExternalForm();
            } catch (MalformedURLException notALink) {
                // Definitely not a hyperlink.
                return null;
            }
        }
    }

    /**
     * Crawls up to {@value #MAX_PAGES} pending pages and closes the database
     * connection afterwards, even if crawling fails part-way.
     */
    public static void main(String[] args) throws Exception {
        Conexion a = new Conexion();
        try {
            for (int i = 0; i < MAX_PAGES; i++) {
                if (!crawlNext(a)) {
                    break;
                }
            }
        } finally {
            a.cerrar();
        }
    }

    /**
     * Processes the next pending URL.
     *
     * @return {@code false} when crawling should stop: the query failed, no
     *         pending URL is left, or the page could not be marked as visited
     */
    private static boolean crawlNext(Conexion a) throws Exception {
        // Next url.
        System.out.print("siguiente url...");
        ResultSet rs = a.query("SELECT docId, url FROM Spider WHERE fecha IS NULL LIMIT 1");
        if (rs == null) {
            // query() has already printed the cause.
            return false;
        }
        String url;
        String docId;
        try {
            if (!rs.next()) {
                System.out.println("[no hay urls pendientes]");
                return false;
            }
            url = rs.getString("url");
            docId = rs.getString("docId");
        } finally {
            // query() hands over ownership of the statement together with the result set.
            Statement owner = rs.getStatement();
            rs.close();
            if (owner != null) {
                owner.close();
            }
        }
        System.out.println("[ok]");
        System.out.println("url: " + url);

        // Get raw html code from url.
        System.out.print("getContent...");
        String html = getContent(url);
        System.out.println("[ok]");

        // Get links from that html code.
        System.out.print("parseLinks...");
        ArrayList<String> links = parseLinks(new URL(url), html);
        System.out.println("[ok]");

        storeLinks(a, links);
        if (comprimirHTML(html, docId) != 0) {
            System.out.println("no se pudo comprimir el documento " + docId);
        }
        //a.addFile(docId , "currentHTML.gz");
        return markVisited(a, docId);
    }

    /**
     * Inserts every link as a new Spider row.
     *
     * Links come from untrusted pages, so they are bound as statement parameters
     * instead of being concatenated into the SQL text. Rows rejected by an
     * integrity constraint are skipped silently; other failures are printed and
     * the remaining links are still processed.
     */
    private static void storeLinks(Conexion a, List<String> links) throws SQLException {
        PreparedStatement insert = a.conexion.prepareStatement(
                "INSERT INTO `Buscador`.`Spider` (`url`) VALUES (?)");
        try {
            for (String link : links) {
                insert.setString(1, link);
                try {
                    insert.executeUpdate();
                } catch (SQLException e) {
                    if (!isIntegrityViolation(e)) {
                        System.out.println(e);
                    }
                }
            }
        } finally {
            insert.close();
        }
    }

    /**
     * Stamps the row {@code docId} with the current date.
     *
     * @return {@code false} if the update failed; the row would otherwise be
     *         selected again by the next iteration
     */
    private static boolean markVisited(Conexion a, String docId) {
        try {
            PreparedStatement mark = a.conexion.prepareStatement(
                    "UPDATE Spider SET `fecha` = ? WHERE `Spider`.`docId` = ? LIMIT 1");
            try {
                mark.setString(1, getDateTime());
                mark.setString(2, docId);
                mark.executeUpdate();
            } finally {
                mark.close();
            }
            return true;
        } catch (SQLException e) {
            System.out.println(e);
            return false;
        }
    }

    /** Tells whether {@code e} reports an integrity constraint violation (SQLSTATE class 23). */
    private static boolean isIntegrityViolation(SQLException e) {
        String state = e.getSQLState();
        return e instanceof SQLIntegrityConstraintViolationException
                || (state != null && state.startsWith("23"));
    }

    /**
     * Writes {@code html} (followed by a line separator) gzip-compressed to
     * {@code docs/<docId>.gz}, replacing any previous copy.
     *
     * The compression is done in-process, so no external gzip binary is needed.
     *
     * @return 0 on success, -1 if the file could not be written
     */
    private static int comprimirHTML(String html, String docId) {
        File target = new File("docs", docId + ".gz");
        try {
            OutputStream file = new FileOutputStream(target);
            try {
                Writer writer = new OutputStreamWriter(new GZIPOutputStream(file), STORAGE_CHARSET);
                writer.write(html);
                writer.write(System.getProperty("line.separator"));
                // Closing flushes the encoder and writes the gzip trailer.
                writer.close();
            } finally {
                // Harmless when the writer already closed it; needed when writing failed.
                file.close();
            }
            return 0;
        } catch (IOException ioe) {
            return -1;
        }
    }

    /** Returns the current date formatted as {@code yyyy/MM/dd}. */
    private static String getDateTime() {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd");
        return dateFormat.format(new Date());
    }
}
/**
 * Thin wrapper around a single JDBC connection to the MySQL database "Buscador".
 *
 * Errors are reported on standard output and signalled through return values
 * ({@code null} / {@code -1}) rather than exceptions, except in {@link #addFile}.
 */
class Conexion {
    // NOTE(review): credentials are hard-coded (root, empty password).
    static String bd = "Buscador";
    static String login = "root";
    static String password = "";
    static String url = "jdbc:mysql://localhost/" + bd;

    /** The live connection, or {@code null} while closed or if opening failed. */
    Connection conexion = null;

    /**
     * Tries to open the connection. A missing driver or a failed login is only
     * printed; the object is then left without a connection.
     */
    public Conexion() throws Exception {
        try {
            abrir();
        } catch (SQLException ex) {
            System.out.println(ex);
        } catch (ClassNotFoundException ex) {
            System.out.println(ex);
        }
    }

    /**
     * Opens the connection unless one is already open.
     *
     * @throws Exception if the driver class cannot be loaded or connecting fails
     */
    public void abrir() throws Exception {
        if (conexion == null) {
            // /usr/lib/jvm/java-6-openjdk/jre/lib/ext
            Class.forName("org.gjt.mm.mysql.Driver"); // load the driver
            conexion = DriverManager.getConnection(url, login, password); // connect to the DB
            System.out.println("Conexion activa");
        } else {
            System.out.println("Existe una conexion activa a" + bd);
        }
    }

    /** Closes the connection if one is open. */
    public void cerrar() throws Exception {
        if (conexion != null) {
            conexion.close();
            conexion = null;
            System.out.println("Se cerro la conexion satisfactoriamente.");
        } else {
            System.out.println("No existe conexion que cerrar");
        }
    }

    /**
     * Runs a SELECT statement.
     *
     * The statement has to stay open while the result set is being read, so the
     * caller owns both and should release them with
     * {@code rs.getStatement().close()} when done.
     *
     * @param consulta SQL text to execute
     * @return the result set, or {@code null} if there is no connection or the
     *         query failed (the cause is printed)
     */
    public ResultSet query(String consulta) {
        if (conexion == null) {
            System.out.println("No existe una conexion activa");
            return null;
        }
        Statement estado = null;
        try {
            estado = conexion.createStatement();
            return estado.executeQuery(consulta);
        } catch (SQLException e) {
            System.out.println(e);
            // No result set was produced, so nobody else can close the statement.
            closeQuietly(estado);
            return null;
        }
    }

    /**
     * Runs an INSERT, UPDATE or DELETE statement.
     *
     * @param consulta SQL text to execute
     * @return the number of affected rows, or -1 on failure. Integrity constraint
     *         violations (e.g. duplicate keys) fail silently; any other error is
     *         printed.
     */
    public int update(String consulta) {
        if (conexion == null) {
            System.out.println("No existe una conexion activa");
            return -1;
        }
        Statement estado = null;
        try {
            estado = conexion.createStatement();
            return estado.executeUpdate(consulta);
        } catch (SQLException e) {
            if (!isIntegrityViolation(e)) {
                System.out.println(e);
            }
            return -1;
        } finally {
            closeQuietly(estado);
        }
    }

    /**
     * Stores the content of {@code fileName} in the html column of the Spider
     * row {@code docId} and then deletes the file.
     *
     * @param docId    key of the row to update
     * @param fileName path of the file to upload
     * @throws IOException  if the file cannot be read
     * @throws SQLException if there is no connection or the update fails
     */
    public void addFile(String docId, String fileName) throws IOException, SQLException {
        if (conexion == null) {
            throw new SQLException("No existe una conexion activa");
        }
        File f = new File(fileName);
        byte[] data = readFile(f);
        PreparedStatement stmt = conexion.prepareStatement(
                "UPDATE Spider SET html = ? WHERE `Spider`.`docId` = ? LIMIT 1 ;");
        try {
            stmt.setBytes(1, data);
            stmt.setString(2, docId);
            stmt.executeUpdate();
        } finally {
            stmt.close();
        }
        f.delete();
    }

    /**
     * Reads a whole file into memory.
     *
     * @return the file's bytes; an empty array for an empty file
     * @throws IOException if the file cannot be opened or read
     */
    static byte[] readFile(File file) throws IOException {
        InputStream in = new FileInputStream(file);
        try {
            // Each chunk is copied out of the read buffer before the buffer is reused.
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int n;
            while ((n = in.read(chunk)) != -1) {
                out.write(chunk, 0, n);
            }
            return out.toByteArray();
        } finally {
            in.close();
        }
    }

    /**
     * Tells whether {@code e} reports an integrity constraint violation.
     * SQLSTATE class 23 is checked as well so the test does not depend on the
     * exception class a particular driver version throws.
     */
    private static boolean isIntegrityViolation(SQLException e) {
        String state = e.getSQLState();
        return e instanceof SQLIntegrityConstraintViolationException
                || (state != null && state.startsWith("23"));
    }

    /** Closes {@code st} if it is not {@code null}, ignoring any failure. */
    private static void closeQuietly(Statement st) {
        if (st == null) {
            return;
        }
        try {
            st.close();
        } catch (SQLException ignored) {
            // Nothing useful can be done if releasing the statement fails.
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment