Skip to content

Instantly share code, notes, and snippets.

@bitsnaps
Created August 8, 2018 01:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bitsnaps/0d793900a3112de68e3e2dc0bf7e3004 to your computer and use it in GitHub Desktop.
Save bitsnaps/0d793900a3112de68e3e2dc0bf7e3004 to your computer and use it in GitHub Desktop.
Simple Groovy crawler for chinabestprice.com
@Grab('org.jsoup:jsoup:1.11.2')
import org.jsoup.*
import org.jsoup.nodes.*
import org.jsoup.select.*
import groovy.json.*
/**
 * Crawls chinabestprice.com search results for a fixed set of brands and
 * categories, downloads each product image into {@code <user.dir>/images},
 * and writes all collected products to {@code <user.dir>/products.json}.
 */
class Main {

    /** Timeout handed to jsoup for search-page requests, in milliseconds (10 seconds). */
    private static final int TIMEOUT_MILLIS = 10 * 1000

    /**
     * Script entry point: visits every brand/category combination, then
     * serializes the products collected by {@code Product.save()} to JSON.
     *
     * @param args command-line arguments (unused)
     */
    static main(args) {
        def categories = ['smartphones': 4039, 'laptop': 455]
        def brands = ['Xiaomi', 'Huawei', 'Asus', 'Lenovo']
        brands.each { brandName ->
            categories.each { category ->
                try {
                    parseUrl(brandName, category.key, category.value)
                } catch (IOException e) {
                    // One unreachable search page must not discard everything collected so far.
                    println("Skipping ${brandName}/${category.key}: ${e.message}")
                }
            }
        }
        if (Product.products.size() > 0) {
            new File("${System.properties['user.dir']}/products.json").write(new JsonBuilder(Product.products).toString())
        }
    }

    /**
     * Fetches one search-result page and saves a {@code Product} for every
     * {@code div.prod} element found on it. A product whose markup cannot be
     * parsed is reported on stdout and skipped.
     *
     * @param brandName  brand used as the search term and stored on the product
     * @param category   category label stored on the product
     * @param categoryId site category id sent as the {@code catID} query parameter
     * @return the selected product elements
     * @throws IOException if the search page cannot be fetched
     */
    static parseUrl(String brandName, String category, int categoryId) {
        // Encode the search term so brands containing spaces or symbols still form a valid query.
        String url = "https://www.chinabestprice.com/search?qs=${URLEncoder.encode(brandName, 'UTF-8')}&catID=${categoryId}"
        // Jsoup.connect is static; no Jsoup instance is needed.
        def doc = Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get()
        def products = doc.select('div.prod')
        products.each { p ->
            try {
                def title = p.select('div.caption h6').first().text()
                def priceWrapper = p.select('div.price-wrapper')
                // A missing sale price raises here and the product is skipped by the catch below.
                double price = toDouble(priceWrapper.select('span.sale-price').first().text())
                // Shipping is free when the marker span is present.
                boolean freeShipping = priceWrapper.select('span.free-shipping').size() > 0

                // Both values default to 0 when the discount block (or a span inside it) is absent.
                def discountWrapper = p.select('div.discount-wrapper')
                double listPrice = toDouble(firstText(discountWrapper.select('span.list-price')))
                int discountRate = (int) toDouble(firstText(discountWrapper.select('span.discount-rate')))

                def desc = title.replace('\'', '') // remove quotes
                def store = p.select('a.store').with { it.size() > 0 ? it.text() : '' }
                def name = desc.split(' ').take(5).join(' ')

                // Protocol-relative image URLs ("//host/path") get an explicit scheme.
                String src = p.select('div img').attr('src')
                String imageUrl = src.startsWith('//') ? "http:${src}" : src
                if (imageUrl) {
                    downloadImage(imageUrl)
                }

                // Quantity is not available on the page, so a dummy value is used.
                int quantity = Math.random() * 100 as Integer

                // Typed values are passed directly: a GString such as "false" would be
                // truthy for a boolean property, and multi-character numeric strings
                // are not castable to primitive number properties.
                // NOTE(review): the single quotes wrapped around the string fields are
                // kept to preserve the existing JSON output — confirm they are wanted.
                new Product(
                        freeShipping: freeShipping,
                        imageUrl: "'${imageUrl}'",
                        listPrice: listPrice,
                        discountRate: discountRate,
                        store: "'${store}'",
                        name: "'${name}'",
                        description: "'${desc}'",
                        brand: "'${brandName}'",
                        category: "'${category}'",
                        quantity: quantity,
                        price: price).save()
            } catch (Exception e) {
                println(e.message)
                println(p.html())
            }
        }
    } //parseUrl()

    /** Returns the text of the first matched element, or an empty string when nothing matched. */
    private static String firstText(elements) {
        elements.size() > 0 ? elements.first().text() : ''
    }

    /** Extracts the first decimal number from text such as "US $1,299.00" or "25% OFF"; 0 when none is found. */
    private static double toDouble(value) {
        def matcher = String.valueOf(value).replace(',', '') =~ /\d+(?:\.\d+)?/
        matcher.find() ? Double.parseDouble(matcher.group()) : 0d
    }

    /** Downloads an image into {@code <user.dir>/images}; failures are reported and otherwise ignored. */
    private static void downloadImage(String imageUrl) {
        try {
            def bytes = Jsoup.connect(imageUrl).ignoreContentType(true).execute().bodyAsBytes()
            def imagesDir = new File("${System.properties['user.dir']}/images")
            // The target directory may not exist on a first run.
            imagesDir.mkdirs()
            new File(imagesDir, new File(imageUrl).name).withOutputStream {
                it.write(bytes)
            }
        } catch (Exception ex) {
            // Best effort: a missing image should not drop the product itself.
            println("Could not download image ${imageUrl}: ${ex.message}")
        }
    }
} // Main
/**
 * One scraped product. Instances register themselves in a shared static
 * list through {@link #save()}; {@code Main} reads that list as
 * {@code Product.products}.
 */
class Product {

    /** Every product saved so far. */
    private static List products = []

    // Pricing and shipping
    boolean freeShipping
    String imageUrl
    double listPrice
    int discountRate

    // Descriptive fields
    String store
    String name
    String description
    String brand
    String category

    // Stock and sale price
    int quantity
    double price

    /**
     * Appends this product to the shared list.
     * You may need to do more checks here (avoid duplicates, validating, sanitizing...).
     *
     * @return the shared list, now including this product
     */
    def save() {
        products.add(this)
        return products
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment