Created
July 30, 2014 07:54
-
-
Save alanboy/10fcfc4c7a229e0bbd2e to your computer and use it in GitHub Desktop.
Web Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*;
import java.net.*;
import java.sql.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Date;
import java.util.regex.*;
import java.util.zip.GZIPOutputStream;
/**
 * Small database-driven web crawler.
 *
 * <p>Each iteration takes one row of the {@code Spider} table whose
 * {@code fecha} column is still NULL, downloads that URL, inserts every
 * hyperlink found in the page back into {@code Spider}, stores the page
 * gzip-compressed under {@code docs/<docId>.gz} and finally stamps the row
 * with the current date so it is not picked again.
 *
 * @author alanboy
 */
class Main {

    /**
     * Opening {@code <a ...>} tag carrying an {@code href} attribute.
     * Group 1 holds a double-quoted value, group 2 a single-quoted one.
     * The negated character class lets the tag span several lines.
     */
    private static final Pattern ANCHOR_HREF = Pattern.compile(
            "<a\\s+(?:[^>]*?\\s)?href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')",
            Pattern.CASE_INSENSITIVE);

    /**
     * ISO-8859-1 maps every byte to the char with the same value, so decoding
     * a page with it and encoding it again with it reproduces the bytes the
     * server sent, whatever the page's real encoding is.
     */
    private static final String RAW_CHARSET = "ISO-8859-1";

    /** Maximum number of pages processed per run. */
    private static final int MAX_PAGES = 50;

    /** Directory (relative to the working directory) holding the stored pages. */
    private static final String DOCS_DIR = "docs";

    /**
     * Downloads the resource behind {@code url}.
     *
     * @param url absolute URL to fetch
     * @return the body with each byte mapped to the char of the same value;
     *         if the connection's input stream cannot be opened, the
     *         {@code toString()} of that {@link IOException} is returned
     *         instead (best effort, the crawl goes on)
     * @throws Exception if {@code url} is malformed or reading the body fails
     */
    static String getContent(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();

        InputStream input;
        try {
            input = connection.getInputStream();
        } catch (IOException ioe) {
            return ioe.toString();
        }

        try {
            Reader reader = new InputStreamReader(input, RAW_CHARSET);
            StringBuilder content = new StringBuilder();
            char[] buffer = new char[8192];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                content.append(buffer, 0, read);
            }
            return content.toString();
        } finally {
            input.close();
        }
    }

    /**
     * Extracts the targets of all {@code <a href=...>} anchors in a page.
     *
     * @param url  address the page was loaded from; relative links are
     *             resolved against it
     * @param html page source; {@code null} yields an empty list
     * @return absolute links in document order (duplicates are kept); empty
     *         hrefs and hrefs that cannot be turned into a URL are skipped
     */
    public static ArrayList<String> parseLinks(URL url, String html) {
        ArrayList<String> links = new ArrayList<String>();
        if (html == null) {
            return links;
        }

        Matcher anchor = ANCHOR_HREF.matcher(html);
        while (anchor.find()) {
            String href = anchor.group(1) != null ? anchor.group(1) : anchor.group(2);
            if (href.length() == 0) {
                continue;
            }
            String absolute = resolve(url, href);
            if (absolute != null) {
                links.add(absolute);
            }
        }
        return links;
    }

    /** Returns {@code href} as an absolute URL string, or null if it is not a usable link. */
    private static String resolve(URL base, String href) {
        try {
            return new URL(href).toExternalForm();
        } catch (MalformedURLException notAbsolute) {
            try {
                return new URL(base, href).toExternalForm();
            } catch (MalformedURLException notALink) {
                // Unknown scheme (e.g. javascript:) or otherwise unparsable.
                return null;
            }
        }
    }

    /**
     * Crawls up to {@link #MAX_PAGES} pending URLs.
     *
     * <p>A page that fails is reported and still marked as visited;
     * otherwise the same row would be selected again on every iteration and
     * the crawl would never advance.
     *
     * @param args unused
     * @throws Exception if the database connection cannot be used or closed
     */
    public static void main(String[] args) throws Exception {
        Conexion a = new Conexion();
        try {
            for (int i = 0; i < MAX_PAGES; i++) {
                System.out.print("siguiente url...");
                String[] pending = nextPending(a);
                if (pending == null) {
                    System.out.println("[fin] no hay url pendiente");
                    break;
                }
                String docId = pending[0];
                String url = pending[1];
                System.out.println("[ok]");
                System.out.println("url: " + url);

                try {
                    crawl(a, docId, url);
                } catch (Exception e) {
                    System.out.println("[error] " + e);
                }

                a.update("UPDATE Spider SET `fecha` = ? WHERE `Spider`.`docId` = ? LIMIT 1",
                        getDateTime(), docId);
            }
        } finally {
            a.cerrar();
        }
    }

    /**
     * Reads the next unvisited row.
     *
     * @return {@code {docId, url}}, or null when the query failed or no row is pending
     */
    private static String[] nextPending(Conexion db) throws SQLException {
        ResultSet rs = db.query("SELECT docId, url FROM Spider WHERE fecha IS NULL LIMIT 1");
        if (rs == null) {
            return null;
        }
        try {
            if (!rs.next()) {
                return null;
            }
            return new String[] { rs.getString("docId"), rs.getString("url") };
        } finally {
            // query() hands over ownership of the statement along with the result set.
            Statement owner = rs.getStatement();
            rs.close();
            if (owner != null) {
                owner.close();
            }
        }
    }

    /** Downloads one page, queues its links and stores its compressed source. */
    private static void crawl(Conexion db, String docId, String url) throws Exception {
        System.out.print("getContent...");
        String html = getContent(url);
        System.out.println("[ok]");

        System.out.print("parseLinks...");
        ArrayList<String> links = parseLinks(new URL(url), html);
        System.out.println("[ok]");

        // Links come from untrusted pages: always bind them as parameters.
        for (String link : links) {
            db.update("INSERT INTO `Buscador`.`Spider` (`url`) VALUES (?)", link);
        }

        if (comprimirHTML(html, docId) != 0) {
            System.out.println("no se pudo guardar " + DOCS_DIR + "/" + docId + ".gz");
        }
    }

    /**
     * Stores {@code html} gzip-compressed as {@code docs/<docId>.gz},
     * creating the directory when needed and replacing any previous file.
     *
     * @param html  page source as returned by {@link #getContent(String)}
     * @param docId identifier used as the file name
     * @return 0 on success, -1 if the file could not be written
     */
    private static int comprimirHTML(String html, String docId) {
        File dir = new File(DOCS_DIR);
        if (!dir.isDirectory() && !dir.mkdirs()) {
            return -1;
        }

        File target = new File(dir, docId + ".gz");
        OutputStream file = null;
        try {
            file = new FileOutputStream(target);
            OutputStream gzip = new GZIPOutputStream(file);
            gzip.write(html.getBytes(RAW_CHARSET));
            gzip.close(); // writes the gzip trailer and closes the file
            return 0;
        } catch (IOException ioe) {
            target.delete(); // do not leave a truncated archive behind
            return -1;
        } finally {
            closeQuietly(file);
        }
    }

    /** Returns today's date formatted as {@code yyyy/MM/dd}. */
    private static String getDateTime() {
        DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd");
        return dateFormat.format(new Date());
    }

    /** Closes {@code resource}, ignoring null and any failure of the close itself. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close.
        }
    }
}

/**
 * Thin wrapper around one JDBC connection to the {@code Buscador} database.
 *
 * <p>If the connection cannot be opened the constructor only reports the
 * problem; {@link #query(String)} then returns null and the {@code update}
 * methods return -1.
 */
class Conexion {
    static String bd = "Buscador";
    static String login = "root";
    static String password = "";
    static String url = "jdbc:mysql://localhost/" + bd;

    /** Live connection, or null while closed or when opening failed. */
    Connection conexion = null;

    /**
     * Tries to open the connection. A missing driver or a failed connect is
     * printed and otherwise ignored.
     *
     * @throws Exception for any other failure raised by {@link #abrir()}
     */
    public Conexion() throws Exception {
        try {
            abrir();
        } catch (SQLException ex) {
            System.out.println(ex);
        } catch (ClassNotFoundException ex) {
            System.out.println(ex);
        }
    }

    /**
     * Loads the driver and connects, unless a connection is already open.
     *
     * @throws Exception if the driver class is missing or the connect fails
     */
    public void abrir() throws Exception {
        if (conexion == null) {
            Class.forName("org.gjt.mm.mysql.Driver"); // load the driver
            conexion = DriverManager.getConnection(url, login, password);
            System.out.println("Conexion activa");
        } else {
            System.out.println("Existe una conexion activa a " + bd);
        }
    }

    /**
     * Closes the connection if one is open.
     *
     * @throws Exception if closing fails
     */
    public void cerrar() throws Exception {
        if (conexion != null) {
            conexion.close();
            conexion = null;
            System.out.println("Se cerro la conexion satisfactoriamente.");
        } else {
            System.out.println("No existe conexion que cerrar");
        }
    }

    /**
     * Runs a SELECT.
     *
     * @param consulta SQL text
     * @return the result set, or null if there is no connection or the query
     *         failed (the error is printed). The caller owns the result set
     *         and must close it together with {@code rs.getStatement()}.
     */
    public ResultSet query(String consulta) {
        if (conexion == null) {
            System.out.println("No existe conexion activa");
            return null;
        }
        Statement estado = null;
        try {
            estado = conexion.createStatement();
            return estado.executeQuery(consulta);
        } catch (Exception e) {
            System.out.println(e);
            closeQuietly(estado);
            return null;
        }
    }

    /**
     * Runs an INSERT/UPDATE/DELETE given as complete SQL text. Never build
     * that text from untrusted values; use
     * {@link #update(String, Object...)} for those.
     *
     * @param consulta SQL text
     * @return affected row count, or -1 on failure
     */
    public int update(String consulta) {
        if (conexion == null) {
            System.out.println("No existe conexion activa");
            return -1;
        }
        Statement estado = null;
        try {
            estado = conexion.createStatement();
            return estado.executeUpdate(consulta);
        } catch (Exception e) {
            return reportUpdateFailure(e);
        } finally {
            closeQuietly(estado);
        }
    }

    /**
     * Runs a parameterized INSERT/UPDATE/DELETE.
     *
     * @param consulta SQL text with one {@code ?} per parameter
     * @param params   values bound in order with {@code setObject}
     * @return affected row count, or -1 on failure
     */
    public int update(String consulta, Object... params) {
        if (conexion == null) {
            System.out.println("No existe conexion activa");
            return -1;
        }
        PreparedStatement stmt = null;
        try {
            stmt = conexion.prepareStatement(consulta);
            if (params != null) {
                for (int i = 0; i < params.length; i++) {
                    stmt.setObject(i + 1, params[i]);
                }
            }
            return stmt.executeUpdate();
        } catch (Exception e) {
            return reportUpdateFailure(e);
        } finally {
            closeQuietly(stmt);
        }
    }

    /**
     * Stores the content of a file in the {@code html} column of a document
     * row and then deletes the file.
     *
     * @param docId    row to update
     * @param fileName file to read
     * @throws IOException  if the file cannot be read
     * @throws SQLException if there is no connection or the update fails
     */
    public void addFile(String docId, String fileName) throws IOException, SQLException {
        if (conexion == null) {
            throw new SQLException("No existe conexion activa");
        }
        File f = new File(fileName);

        ByteArrayOutputStream data = new ByteArrayOutputStream();
        FileInputStream fis = new FileInputStream(f);
        try {
            byte[] chunk = new byte[1024];
            int sz;
            while ((sz = fis.read(chunk)) != -1) {
                data.write(chunk, 0, sz);
            }
        } finally {
            fis.close();
        }

        PreparedStatement stmt = conexion.prepareStatement(
                "UPDATE Spider SET html = ? WHERE `Spider`.`docId` = ? LIMIT 1 ;");
        try {
            stmt.setBytes(1, data.toByteArray());
            stmt.setString(2, docId);
            stmt.executeUpdate();
        } finally {
            closeQuietly(stmt);
        }
        f.delete();
    }

    /**
     * Prints {@code e} unless it is an integrity-constraint violation
     * (SQLSTATE class 23, e.g. a URL that is already queued), which is
     * expected while crawling and therefore stays silent.
     *
     * @return always -1, the failure code of the update methods
     */
    private static int reportUpdateFailure(Exception e) {
        boolean constraintViolation = false;
        if (e instanceof SQLException) {
            String state = ((SQLException) e).getSQLState();
            constraintViolation = state != null && state.startsWith("23");
        }
        if (!constraintViolation) {
            System.out.println(e);
        }
        return -1;
    }

    /** Closes {@code stmt}, ignoring null and any failure of the close itself. */
    private static void closeQuietly(Statement stmt) {
        if (stmt == null) {
            return;
        }
        try {
            stmt.close();
        } catch (SQLException ignored) {
            // Nothing useful can be done about a failed close.
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment