Created
August 8, 2018 01:14
-
-
Save bitsnaps/0d793900a3112de68e3e2dc0bf7e3004 to your computer and use it in GitHub Desktop.
Simple Groovy crawler for chinabestprice.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@Grab('org.jsoup:jsoup:1.11.2') | |
import org.jsoup.* | |
import org.jsoup.nodes.* | |
import org.jsoup.select.* | |
import groovy.json.* | |
/**
 * Crawls chinabestprice.com search results for a fixed list of brands and
 * categories, saves each product image under {@code images/} and writes the
 * collected products to {@code products.json} in the working directory.
 */
class Main {

    /** Page-fetch timeout in milliseconds; 0 means "wait forever" (the library default is 30 seconds). */
    private static final int TIMEOUT_MILLIS = 0

    /**
     * Script entry point: crawls every brand/category combination, then dumps
     * whatever was collected as JSON.
     *
     * @param args command-line arguments (unused)
     */
    static main(args) {
        def categories = ['smartphones': 4039, 'laptop': 455]
        def brands = ['Xiaomi', 'Huawei', 'Asus', 'Lenovo']
        brands.each { brandName ->
            categories.each { category ->
                try {
                    parseUrl(brandName, category.key, category.value)
                } catch (IOException e) {
                    // One unreachable search page must not discard the products already collected.
                    System.err.println("Skipping ${brandName}/${category.key}: ${e.message}")
                }
            }
        }
        if (Product.products.size() > 0) {
            new File("${System.properties['user.dir']}/products.json").write(new JsonBuilder(Product.products).toString())
        }
    }

    /**
     * Fetches one search-result page and registers a {@link Product} for every
     * {@code div.prod} element found on it. A product whose markup cannot be
     * parsed is reported on stdout and skipped.
     *
     * @param brandName  brand used as the search query and stored on each product
     * @param category   category label stored on each product
     * @param categoryId numeric category id sent as the {@code catID} query parameter
     * @return the selected product elements
     * @throws IOException if the search page itself cannot be fetched
     */
    static parseUrl(String brandName, String category, int categoryId) {
        // Encode the query so a brand containing spaces or '&' cannot corrupt the URL.
        String url = "https://www.chinabestprice.com/search?qs=${URLEncoder.encode(brandName, 'UTF-8')}&catID=${categoryId}"
        // connect() is static on Jsoup; no instance is needed.
        def doc = Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get()
        // Make sure the download target exists, otherwise every image write fails.
        def imageDir = new File("${System.properties['user.dir']}/images")
        imageDir.mkdirs()
        def products = doc.select('div.prod') //.first() // just the first for testing
        products.each { p ->
            try {
                def title = p.select('div.caption h6').first().text()
                def priceWrapper = p.select('div.price-wrapper')
                double price = parseDecimal(priceWrapper.select('span.sale-price').first().text())
                // Shipping is free when the badge is present.
                boolean freeShipping = priceWrapper.select('span.free-shipping').size() > 0
                def discountWrapper = p.select('div.discount-wrapper')
                // Both values fall back to 0 when the listing shows no discount.
                double listPrice = parseDecimal(discountWrapper.select('span.list-price').with { it.size() > 0 ? it.first().text() : '' })
                int discountRate = (int) parseDecimal(discountWrapper.select('span.discount-rate').with { it.size() > 0 ? it.first().text() : '' })
                def desc = title.replace('\'', '') // remove quotes
                def store = p.select('a.store').with { it.size() > 0 ? it.text() : '' }
                def name = desc.split(' ').take(5).join(' ')
                def src = p.select('div img').attr('src')
                // Protocol-relative URLs ("//host/img.jpg") need a scheme before they can be fetched.
                String imageUrl = src.startsWith('//') ? "http:${src}" : src
                if (imageUrl) {
                    downloadImage(imageUrl, imageDir)
                }
                int quantity = (int) (Math.random() * 100) // quantity is not available so we use a dummy value here
                // Pass typed values, not interpolated strings, so the typed Product
                // fields receive real booleans and numbers, and the string fields
                // are stored without extra quote characters.
                new Product(freeShipping: freeShipping, imageUrl: imageUrl, listPrice: listPrice,
                        discountRate: discountRate, store: store, name: name, description: desc,
                        brand: brandName, category: category, quantity: quantity, price: price).save()
            } catch (Exception e) {
                println(e.message)
                println(p.html())
            }
        }
    } //parseUrl()

    /**
     * Extracts the first decimal number from a price-like label such as
     * {@code "US $1,299.00"} or {@code "25% OFF"}.
     *
     * @param text label text; may be null or empty
     * @return the parsed number, or 0 when the text contains no digits
     */
    private static double parseDecimal(String text) {
        def matcher = (text ?: '').replace(',', '') =~ /\d+(?:\.\d+)?/
        return matcher.find() ? Double.parseDouble(matcher.group()) : 0d
    }

    /**
     * Best-effort download of one product image into {@code imageDir}, named
     * after the last path segment of the URL. A failure is logged and does not
     * prevent the product from being saved.
     *
     * @param imageUrl absolute URL of the image
     * @param imageDir existing directory that receives the file
     */
    private static void downloadImage(String imageUrl, File imageDir) {
        try {
            def bytes = Jsoup.connect(imageUrl).ignoreContentType(true).execute().bodyAsBytes()
            new File(imageDir, new File(imageUrl).name).withOutputStream {
                it.write(bytes)
            }
        } catch (Exception ex) {
            System.err.println("Could not download image ${imageUrl}: ${ex.message}")
        }
    }
} // Main
/**
 * One product listing plus a class-wide, in-memory registry of every listing
 * that has been saved.
 */
class Product {
    /** Every product on which {@link #save()} has been called, in call order. */
    private static List products = []

    /** Whether the listing ships for free. */
    boolean freeShipping
    /** Address of the product image. */
    String imageUrl
    /** Price before any discount. */
    double listPrice
    /** Discount rate of the listing. */
    int discountRate
    /** Name of the store offering the product. */
    String store
    /** Short display name. */
    String name
    /** Full description text. */
    String description
    /** Brand the product belongs to. */
    String brand
    /** Category label. */
    String category
    /** Number of units. */
    int quantity
    /** Current selling price. */
    double price

    /**
     * Appends this product to the shared registry. No validation,
     * sanitising or duplicate detection is performed; add those checks
     * here if they are needed.
     *
     * @return the registry list, now containing this product
     */
    def save() {
        products.add(this)
        return products
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment