Skip to content

Instantly share code, notes, and snippets.

@rhzs
Last active August 29, 2015 14:26
Show Gist options
  • Save rhzs/417ef552587aa5dff6b0 to your computer and use it in GitHub Desktop.
Save rhzs/417ef552587aa5dff6b0 to your computer and use it in GitHub Desktop.
Simple Java/Groovy example of data and web scraping via the import.io API
// Author: Rheza Satria (2015), Indonesia
// Purpose:
// Simple Groovy example of data and web crawling/scraping via the import.io API.
// Note:
// Lazada is an Indonesian e-commerce site. Feel free to swap in another e-commerce site.
// Run:
// groovy lazada_crawl.groovy
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
@Grab('mysql:mysql-connector-java:5.1.25')
@GrabConfig(systemClassLoader = true)
import groovy.sql.Sql
// One-time setup: make sure the table that stores the scraped responses exists.
// A MySQL database named 'lazada' must already exist; adjust the user name and
// password below to match your server.
def sql = Sql.newInstance("jdbc:mysql://localhost:3306/lazada", "root", "password", "com.mysql.jdbc.Driver")
try {
    // Simple JSON dump: each row holds one raw response, the URL it came from and
    // a status flag. A scheduler can consume the JSON later to extract the price
    // and product information.
    sql.execute('create table IF NOT EXISTS jsondata(id INT NOT NULL AUTO_INCREMENT, json MEDIUMTEXT NOT NULL, url TEXT, status VARCHAR(255), PRIMARY KEY(id))')
} finally {
    // Release the connection even when the DDL statement fails.
    sql.close()
}
/**
 * Requests one listing page through the import.io endpoint at
 * https://api.import.io/store/data/_magic and stores the raw response in the
 * {@code jsondata} MySQL table.
 *
 * The page URL is built as {@code "http://" + lazadaUrl + "?page=" + page + otherOptions}.
 * The response body is saved with its line terminators removed, together with
 * that page URL and the status {@code 'ACTIVE'}.
 *
 * @param lazadaUrl    listing URL without the scheme, e.g. "www.lazada.co.id/beli-smartphone/"
 * @param page         value of the {@code page} query parameter
 * @param otherOptions extra query-string text appended verbatim, e.g. "&itemperpage=120";
 *                     optional, null or empty adds nothing
 * @return always null; the function is called for its side effects
 * @throws IOException  if the HTTP request fails or the response cannot be read
 * @throws java.sql.SQLException if the database connection or insert fails
 */
def getDataFromLazada(String lazadaUrl, int page, String otherOptions = "") {
    String url = "https://api.import.io/store/data/_magic"
    String reqUrl = "http://" + lazadaUrl + "?page=" + page + (otherOptions ?: "")

    StringBuilder html = new StringBuilder()
    HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection()
    try {
        conn.setReadTimeout(30000)
        conn.addRequestProperty("Accept-Language", "en-US,en;q=0.8")
        conn.addRequestProperty("User-Agent", "Mozilla")
        // Enabling output lets us send a request body (HttpURLConnection then issues a POST).
        conn.setDoOutput(true)

        def w = new OutputStreamWriter(conn.getOutputStream(), "UTF-8")
        try {
            w.write('{"url":"'+ reqUrl + '","apiVersionGuid":null,"cookies":null}')
        } finally {
            w.close()
        }
        // Log the page being scraped rather than the constant API endpoint.
        println "Requested URL: " + reqUrl

        // Decode explicitly as UTF-8 (matching the request encoding) instead of
        // relying on the platform default charset.
        def inside = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))
        try {
            String inputLine
            // readLine() strips line terminators, so the stored text is one long line.
            while ((inputLine = inside.readLine()) != null) {
                html.append(inputLine)
            }
        } finally {
            inside.close()
        }
    } finally {
        // Always release the connection, including on request/read failures.
        conn.disconnect()
    }

    // Define your MySQL user name and password to connect.
    def sql = Sql.newInstance("jdbc:mysql://localhost:3306/lazada", "root", "password", "com.mysql.jdbc.Driver")
    try {
        def params = [html.toString(), reqUrl, 'ACTIVE']
        sql.execute 'INSERT INTO jsondata(json, url, status) VALUES (?, ?, ?)', params
    } finally {
        sql.close()
    }
    // Keep the original (incidental) return value.
    return null
}
// Uncomment one of the loops below to run the crawler.
// 248.times { // as of 22 June 2015 - www.lazada.co.id/beli-smartphone/
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/beli-smartphone/", it);
// }
// 596 pages as of 22 June 2015 - www.lazada.co.id/fashion-wanita/
// 597.times {
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/fashion-wanita/", it, "&itemperpage=120");
// }
// 210 pages as of 22 June 2015 - http://www.lazada.co.id/fashion-pria/
// 211.times {
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/fashion-pria/", it, "&itemperpage=120");
// }
// 33.times {
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/fashion-anak-perempuan/", it, "&itemperpage=120");
// }
// 19.times {
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/fashion-anak-laki-laki/", it, "&itemperpage=120");
// }
// 21.times {
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/beli-kacamata-pria/", it, "&itemperpage=120");
// }
// 23.times {
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/beli-kacamata-wanita/", it, "&itemperpage=120");
// }
// 17.times {
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/koper/", it, "&itemperpage=120");
// }
// 20.times {
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/aksesoris-travel/", it, "&itemperpage=120");
// }
// 124.times {
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/tas-dan-tas-ransel/", it, "&itemperpage=120");
// }
// 5.times {
// if (it > 0)
// getDataFromLazada("www.lazada.co.id/penawaran-khusus-tas-koper/", it, "&itemperpage=120");
// }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment