danker/TripleDScreenScraping.groovy

## TripleDScreenScraping.groovy
// Required jars (I know, I know...I should investigate Grape)
// nekohtml.jar
// xercesimpl.jar

// Pages that list the restaurants we want to scrape - our "seeds" in crawl parlance.
def seeds = ["http://www.foodnetwork.com/shows/guys-diners-drive-ins-and-dives/index.html"]

// Load the NekoHTML parser with Xerces - this lets XmlSlurper parse real-world HTML.
// Deliberately assigned without `def`, so it remains a script-binding variable as before.
slurper = new XmlSlurper(new org.cyberneko.html.parsers.SAXParser())

// Visit each seed URL in turn and print what can be extracted from its listing paragraphs.
seeds.each { seedAddress ->

    println "Accessing seed URL ${seedAddress}"
    def seedURL = new URL(seedAddress)

    // withReader closes the underlying stream once the closure finishes.
    seedURL.withReader { seedReader ->
        def seedHTML = slurper.parse(seedReader)

        // Show the title of the seed page we're parsing (printed as the list of matching nodes).
        def titleNodes = seedHTML.depthFirst().grep { it.name() == 'TITLE' }
        println "Seed page title is ${titleNodes}"

        // find() returns null when nothing matches; guard it so a page without a
        // 'body-text' element is reported and skipped instead of throwing an NPE.
        def listingContainer = seedHTML.depthFirst().find { it.@class == 'body-text' }
        if (listingContainer == null) {
            System.err.println "No element with class 'body-text' found at ${seedAddress}; skipping."
            return // leaves only this withReader closure; the next seed is still processed
        }

        listingContainer.children().each { p ->

            // Only children with more than two child nodes are treated as listings.
            if (p.children().size() > 2) {

                def text = p.text()

                println text

                // Group 1: everything before the first digit run; group 2: from that
                // digit run up to a five-digit number (used here as name and address).
                text.find(/(.*?)(\d+.+\d{5})/) { match, name, address ->
                    println "NAME = " + name
                    println "ADDRESS = " + address
                }

                // Prints "null" after the label when no "(ddd) ... ddd ... dddd" pattern is present.
                println "TELEPHONE = " + text.find(/(\(\d{3}\).*?\d{3}.*?\d{4})/)

                // Whatever follows "Website:" on the same line is printed with an http:// prefix.
                text.find(/(Website:\s*)(.*)/) { match, website, url ->
                    println "WEBSITE = http://" + url
                }

                // Visual separator between listings.
                println "-**-" * 20

            }
        }

    }
}
	// Required jars (I know, I know...I should investigate Grape)
	// nekohtml.jar
	// xercesimpl.jar

	// Pages that list the restaurants we want to scrape - our "seeds" in crawl parlance.
	// No `def` here: `seeds` is already declared earlier in this script, and declaring
	// it a second time in the same scope is a compile error.
	seeds = ["http://www.foodnetwork.com/shows/guys-diners-drive-ins-and-dives/index.html"]

	// Load the NekoHTML parser with Xerces - this lets XmlSlurper parse real-world HTML.
	slurper = new XmlSlurper(new org.cyberneko.html.parsers.SAXParser())

	// Visit each seed URL in turn and print what can be extracted from its listing paragraphs.
	seeds.each { seedAddress ->

		println "Accessing seed URL ${seedAddress}"
		def seedURL = new URL(seedAddress)

		// withReader closes the underlying stream once the closure finishes.
		seedURL.withReader { seedReader ->
			def seedHTML = slurper.parse(seedReader)

			// Show the title of the seed page we're parsing (printed as the list of matching nodes).
			def titleNodes = seedHTML.depthFirst().grep { it.name() == 'TITLE' }
			println "Seed page title is ${titleNodes}"

			// find() returns null when nothing matches; guard it so a page without a
			// 'body-text' element is reported and skipped instead of throwing an NPE.
			def listingContainer = seedHTML.depthFirst().find { it.@class == 'body-text' }
			if (listingContainer == null) {
				System.err.println "No element with class 'body-text' found at ${seedAddress}; skipping."
				return // leaves only this withReader closure; the next seed is still processed
			}

			listingContainer.children().each { p ->

				// Only children with more than two child nodes are treated as listings.
				if (p.children().size() > 2) {

					def text = p.text()

					println text

					// Group 1: everything before the first digit run; group 2: from that
					// digit run up to a five-digit number (used here as name and address).
					text.find(/(.*?)(\d+.+\d{5})/) { match, name, address ->
						println "NAME = " + name
						println "ADDRESS = " + address
					}

					// Lazy `.*?` gaps (restored) allow any separator between the digit groups.
					// Prints "null" after the label when no phone-like pattern is present.
					println "TELEPHONE = " + text.find(/(\(\d{3}\).*?\d{3}.*?\d{4})/)

					// `(.*)` (restored) captures the rest of the line after "Website:",
					// not just a single character.
					text.find(/(Website:\s*)(.*)/) { match, website, url ->
						println "WEBSITE = http://" + url
					}

					// Visual separator between listings (string repeated 20 times).
					println "-**-" * 20

				}
			}

		}
	}