Created
June 21, 2011 09:31
-
-
Save dexterous/1037526 to your computer and use it in GitHub Desktop.
A not-so-quick script, hacked together to scrape the product info from Lenovo's product listing page.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scrapes Lenovo's (en_IN) product listing page and builds `models`, a map of
// model number -> [name, price, specs].
// TagSoup is plugged into XmlSlurper as the underlying SAX parser for the page's HTML.
@Grab(group='org.ccil.cowan.tagsoup', module='tagsoup', version='1.2.1')
def parser = new XmlSlurper(new org.ccil.cowan.tagsoup.Parser())

// The second query parameter must be spelled '&current-category-id'. A leading
// '&curren' is easily collapsed into the single character '¤' by HTML entity
// decoding, which silently drops the category filter from the request.
def html = parser.parse('http://shopap.lenovo.com/SEUILibrary/controller/e/inweb/LenovoPortal/en_IN/catalog.workflow:category.details?current-catalog-id=3634951826AE4D3881BFFF1AC5FCD957&current-category-id=41AB4B1B55F74FF8833753D7713BB6D6&tab=1&runfacets=1&altercrumb=0&initpage=seriespage&filter=&page-size=200')

// Walk the nested layout tables down to the series results table and keep only
// the rows carrying the 'td-bkg-nograd' class (one per listed model).
def model_rows = html.body.div.find { it.@id == 'bodywrap' }.div.table.find { it.@id == 'container' }.tr[1].td.table.tr[1].td.form.table.tr[1].td.find { it.@id == 'series_results_table' }.form.table.tr[1].td.table.tr.findAll { it.@class == 'td-bkg-nograd' }

// Fold the rows into a map keyed by model number.
def models = model_rows.inject([:]) { model, tr ->
    // The third cell holds the model number (div), the name (p) and a price table.
    def modelNum = tr.td[2].div.text()
    model[modelNum] = [
        name: tr.td[2].p.text().trim(),
        // Strip thousands separators and the 'Rs.' prefix before converting.
        price: (tr.td[2].table[0].tr[1].td[1].text().replaceAll(',', '') - 'Rs.') as BigDecimal,
        // Cells 4..6 hold '· '-separated spec entries; split, trim and drop empties.
        specs: parseSpecs(tr.td[4..6]*.text().collect { it.split(/· /)*.trim() }.flatten().findAll { it })
    ]
    return model
}
/**
 * Turns the flat list of spec strings scraped for one model into a structured map.
 *
 * Entries are read by position: [0] processor, [1] operating system, [2] display,
 * [3] RAM, [4] disk.
 *
 * @param specLine list of trimmed spec strings in the order described above
 * @return map with keys processor (code, speed), os, display (size, type, resolution),
 *         ram (size, speed) and disk
 *
 * NOTE(review): each regex lookup indexes the first match with [0]; an entry that
 * does not match its pattern will make that lookup fail rather than yield null.
 */
def parseSpecs(specLine) {
    def processor = (specLine[0] =~ /Intel® Core™ (\S{7}) Processor \( (\S+)GHz .* \)/)[0]
    def display = (specLine[2] =~ /(.{4}) " (\S+) (\S+)/)[0]
    // \d+ (not \d): with find semantics a single-digit group would match only the
    // trailing digit of a two-digit size, e.g. "16 GB" would be read as 6.
    def ram = (specLine[3] =~ /(\d+) GB .+ (\d+)MHz/)[0]

    return [
        processor: [code: processor[1], speed: processor[2]],
        os: OS.parseText(specLine[1]),
        // Second token is the resolution name, third is the panel type.
        display: [size: display[1], type: display[3], resolution: Resolution.parseText(display[2])],
        ram: [size: ram[1] as Integer, speed: ram[2] as Integer],
        // Disk entry is the capacity followed by 'GB' and an optional ' SSD SATA' tag.
        disk: (specLine[4] - ' SSD SATA' - 'GB') as Integer
    ]
}
// Print every model as "<number>: <details>", with an 80-character '=' rule
// (wrapped in newlines) between consecutive entries.
def divider = ('=' * 80).center(82, "\n")
def entries = models.collect { k, v -> "$k: $v" }
println entries.join(divider)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Operating systems recognised on the listing page, each tied to the exact
 * text used for it there.
 */
enum OS {
    DOS('PC DOS 2000 License'),
    WIN('Genuine Windows 7 Professional 32');

    /** Text identifying this operating system in the scraped spec list. */
    private final String text

    private OS(text) {
        this.text = text
    }

    /**
     * Looks up the constant whose text equals the given one.
     *
     * @param text spec text to look up
     * @return the matching constant, or null when none matches
     */
    public static parseText(text) {
        for (candidate in values()) {
            if (candidate.text == text) {
                return candidate
            }
        }
        return null
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Display resolutions by marketing name, with their pixel dimensions.
 * A '+' in the marketing name is represented as the '_PLUS' suffix on the constant.
 */
enum Resolution {
    HD_PLUS(1600, 900),
    HD(1366, 768),
    WXGA_PLUS(1440, 900),
    WXGA(1280, 800)

    private final int width, height

    private Resolution(width, height) {
        this.width = width
        this.height = height
    }

    /** Renders as "<marketing name> (<width> x <height>)", e.g. "HD+ (1600 x 900)". */
    public String toString() {
        def label = name().replace('_PLUS', '+')
        return label + ' (' + width + ' x ' + height + ')'
    }

    /**
     * Resolves a marketing name such as "HD+" or "WXGA" to its constant.
     *
     * @param text marketing name, with '+' standing for the '_PLUS' suffix
     * @return the matching constant
     * @throws IllegalArgumentException when no constant has that name
     */
    public static parseText(text) {
        def constantName = text.replace('+', '_PLUS')
        return valueOf(constantName)
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment