Skip to content

Instantly share code, notes, and snippets.

@danker
Created October 8, 2010 05:48
Show Gist options
  • Save danker/616419 to your computer and use it in GitHub Desktop.
Save danker/616419 to your computer and use it in GitHub Desktop.
// Required jars (I know, I know...I should investigate Grape)
// nekohtml.jar
// xercesimpl.jar
// Define the pages which contain links to products - our "seeds" in crawl parlance.
def seeds = ["http://www.foodnetwork.com/shows/guys-diners-drive-ins-and-dives/index.html"]

// Load the NekoHTML parser with Xerces - this lets us parse the HTML.
// Intentionally left undeclared (no `def`) so it stays in the script binding, as before.
slurper = new XmlSlurper(new org.cyberneko.html.parsers.SAXParser())

// Now let's loop through each seed URL in turn: fetch the page, parse it, and print the
// name, address, telephone and website found in each listing paragraph.
seeds.each { seed ->
    println "Accessing seed URL ${seed}"
    def seedURL = new URL(seed)
    seedURL.withReader { seedReader ->
        def seedHTML = slurper.parse(seedReader)

        // Show the title of the seed page we're parsing.
        // The upper-case 'TITLE' is what this script has always matched on
        // (NekoHTML is expected to upper-case element names - confirm against its settings).
        println "Seed page title is ${seedHTML.depthFirst().grep { it.name() == 'TITLE' }}"

        // The listings live under the first element whose class is 'body-text'.
        def listing = seedHTML.depthFirst().find { it.@class == 'body-text' }
        if (listing == null) {
            // Previously this case threw a NullPointerException and aborted the whole crawl;
            // report it and move on to the next seed instead.
            println "No 'body-text' element found at ${seed} - skipping"
            return
        }

        listing.children().each { p ->
            // Only children with more than two child nodes are treated as listings.
            if (p.children().size() > 2) {
                def text = p.text()
                println text

                // Group 1: everything before the address; group 2: from the first digit
                // run through a five-digit run (the ZIP code).
                text.find(/(.*?)(\d+.+\d{5})/) { match, name, address ->
                    println "NAME = " + name
                    println "ADDRESS = " + address
                }

                // Prints "TELEPHONE = null" when no "(ddd) ... ddd ... dddd" pattern is present.
                println "TELEPHONE = " + text.find(/(\(\d{3}\).*?\d{3}.*?\d{4})/)

                text.find(/(Website:\s*)(.*)/) { match, website, url ->
                    // Only prepend a scheme when the listing doesn't already carry one,
                    // so we never print "http://http://...".
                    def hasScheme = url ==~ '(?i)https?://.*'
                    println "WEBSITE = " + (hasScheme ? url : "http://" + url)
                }

                println "-**-" * 20
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment