himerzi/HTMLSpider.groovy

## HTMLSpider.groovy
/*
*Michael Detmold Groovy Coursework 1. November 2010.
*This program will  fetch a web page and then construct an index of all the other pages the web page links to. It can do this for several layers of linking.
*/

// Base address the crawl starts from; parse() also prepends it to links that begin with '/'.
address = "http://www.google.com"
// Depth limit: recursion() stops as soon as its level counter equals this value.
levels = 2


// A /.../ literal is a Groovy slashy string, used here to hold a regular expression.
// Forward slashes inside a slashy string must be escaped.
// The pattern matches a case-insensitive href= followed by a double-quoted value:
// either one that starts with "http", or any other quoted text (optionally ending in '/').
regex = /[hH][Rr][Ee][Ff]=(("[Hh][Tt][Tt][Pp].*?")|((".*?\/?")))/

// Create the directory in which write() stores the generated HTML index files.
new File("LinkLister").mkdir()

// Start the crawl at the base address with the level counter at 0.
recursion(address,0)
// Crawls one page and then each of the links found on it.
//   page - URL of the page to fetch, as a String
//   i    - number of levels already descended; crawling stops when it equals `levels`
// Returns the list of links found on the page, or null when the depth limit was hit.
def recursion(page, i) {
    // Depth limit reached: leave this page untouched.
    if (i == levels) {
        return null
    }

    def nextLevel = i + 1

    // Fetch the page to disk, point `file` at it and extract its links.
    download(page)
    load()
    def links = parse()

    // The index file is named after the last '/'-separated piece of the URL.
    def indexName = page.tokenize("/")[-1]
    write(links, indexName)

    // Descend into every extracted link, one level deeper.
    for (link in links) {
        recursion(link, nextLevel)
    }
    return links
}

// Downloads `page` into a file called "download" in the current working directory,
// replacing any previous contents.
//   page - URL of the page to fetch, as a String
// Throws MalformedURLException if `page` is not a valid URL and IOException if the
// transfer fails. Both streams are closed even when the copy throws.
def download(page)
{
    //Following based on http://groovy.codehaus.org/Simple+file+download+from+URL
    // withOutputStream/withInputStream close their stream when the closure exits,
    // so neither the file handle nor the network connection can leak.
    new File("download").withOutputStream { out ->
        new URL(page).withInputStream { input ->
            out << input
        }
    }
}
// Points the script-level `file` variable at the downloaded page.
// The path is the same relative "download" file that download() writes, so the
// script works from any working directory instead of only from /Users/md.
// Returns the File that was assigned to `file`.
def load(){
    file = new File("download")
}
// Extracts the links from the page currently referenced by the script-level `file`.
// Reads the script-level `regex` (href matcher) and `address` (base URL).
// Returns a list of absolute URL strings:
//   - targets starting with '/' are prefixed with `address`
//   - targets starting with http:// or https:// are kept unchanged
//   - anything else (e.g. "mailto", "#top", empty href) is dropped, because it
//     would make new URL(...) throw MalformedURLException later on
def parse(){
    // Collect every href="..." occurrence, line by line.
    def hrefs = []
    file.eachLine { aLine ->
        hrefs.addAll(aLine.findAll(regex)) //addAll keeps the list flat
    }
    // Remove duplicates once, after all lines have been read.
    hrefs.unique()

    def list = []
    hrefs.each { href ->
        // The text between the first pair of double quotes is the link target;
        // it is null for an empty href="".
        def target = href.tokenize("\"")[1]
        if (target == null) {
            return
        }
        if (target.startsWith("/")) {
            // Relative links are converted into absolute links.
            list << address + target
        } else if (target ==~ /(?i)https?:\/\/.*/) {
            // Already a full absolute link; no need to modify it.
            list << target
        }
    }

    return list
}


// Writes an HTML index of `list` to LinkLister/<fileName>.html and echoes the
// list to the console.
//   list     - the link strings to index
//   fileName - base name of the generated file; also used in the heading
def write(list,fileName){
    // Echo the collected links to the console.
    println list

    // Assemble the document: opening tag, heading, one item per link, closing tag.
    def markup = new StringBuilder("<UL>\n")
    markup << "<H1>$fileName has links to:</H1>\n"
    for (link in list) {
        markup << """<LI><a href="$link">${link}</a>\n"""
    }
    markup << "</UL>\n"

    new File("LinkLister/${fileName}.html").write(markup.toString())
}
	/*
	*Michael Detmold Groovy Coursework 1. November 2010.
	*This program will fetch a web page and then construct an index of all the other pages the web page links to. It can do this for several layers of linking.
	*/

	// Base address the crawl starts from; parse() also prepends it to links that begin with '/'.
	address = "http://www.google.com"
	// Depth limit: recursion() stops as soon as its level counter equals this value.
	levels = 2


	// A /.../ literal is a Groovy slashy string, used here to hold a regular expression.
	// Forward slashes inside a slashy string must be escaped.
	// The pattern matches a case-insensitive href= followed by a double-quoted value:
	// either one that starts with "http", or any other quoted text (optionally ending in '/').
	// The lazy quantifiers are `.*?` and the alternation is a plain `|`; with `.?` and an
	// escaped `\|` the pattern would require a literal pipe and never match a real href.
	regex = /[hH][Rr][Ee][Ff]=(("[Hh][Tt][Tt][Pp].*?")|((".*?\/?")))/

	// Create the directory in which write() stores the generated HTML index files.
	new File("LinkLister").mkdir()

	// Start the crawl at the base address with the level counter at 0.
	recursion(address,0)
	// Crawls one page and then each of the links found on it.
	//   page - URL of the page to fetch, as a String
	//   i    - number of levels already descended; crawling stops when it equals `levels`
	// Returns the list of links found on the page, or null when the depth limit was hit.
	def recursion(page, i) {
		// Depth limit reached: leave this page untouched.
		if (i == levels) {
			return null
		}

		def nextLevel = i + 1

		// Fetch the page to disk, point `file` at it and extract its links.
		download(page)
		load()
		def links = parse()

		// The index file is named after the last '/'-separated piece of the URL.
		def indexName = page.tokenize("/")[-1]
		write(links, indexName)

		// Descend into every extracted link, one level deeper.
		for (link in links) {
			recursion(link, nextLevel)
		}
		return links
	}

	// Downloads `page` into a file called "download" in the current working directory,
	// replacing any previous contents.
	//   page - URL of the page to fetch, as a String
	// Throws MalformedURLException if `page` is not a valid URL and IOException if the
	// transfer fails. Both streams are closed even when the copy throws.
	def download(page)
	{
		//Following based on http://groovy.codehaus.org/Simple+file+download+from+URL
		// withOutputStream/withInputStream close their stream when the closure exits,
		// so neither the file handle nor the network connection can leak.
		new File("download").withOutputStream { out ->
			new URL(page).withInputStream { input ->
				out << input
			}
		}
	}
	// Points the script-level `file` variable at the downloaded page.
	// The path is the same relative "download" file that download() writes, so the
	// script works from any working directory instead of only from /Users/md.
	// Returns the File that was assigned to `file`.
	def load(){
		file = new File("download")
	}
	// Extracts the links from the page currently referenced by the script-level `file`.
	// Reads the script-level `regex` (href matcher) and `address` (base URL).
	// Returns a list of absolute URL strings:
	//   - targets starting with '/' are prefixed with `address`
	//   - targets starting with http:// or https:// are kept unchanged
	//   - anything else (e.g. "mailto", "#top", empty href) is dropped, because it
	//     would make new URL(...) throw MalformedURLException later on
	def parse(){
		// Collect every href="..." occurrence, line by line.
		def hrefs = []
		file.eachLine { aLine ->
			hrefs.addAll(aLine.findAll(regex)) //addAll keeps the list flat
		}
		// Remove duplicates once, after all lines have been read.
		hrefs.unique()

		def list = []
		hrefs.each { href ->
			// The text between the first pair of double quotes is the link target;
			// it is null for an empty href="".
			def target = href.tokenize("\"")[1]
			if (target == null) {
				return
			}
			if (target.startsWith("/")) {
				// Relative links are converted into absolute links.
				list << address + target
			} else if (target ==~ /(?i)https?:\/\/.*/) {
				// Already a full absolute link; no need to modify it.
				list << target
			}
		}

		return list
	}


	// Writes an HTML index of `list` to LinkLister/<fileName>.html and echoes the
	// list to the console.
	//   list     - the link strings to index
	//   fileName - base name of the generated file; also used in the heading
	def write(list,fileName){
		// Echo the collected links to the console.
		println list

		// Assemble the document: opening tag, heading, one item per link, closing tag.
		def markup = new StringBuilder("<UL>\n")
		markup << "<H1>$fileName has links to:</H1>\n"
		for (link in list) {
			markup << """<LI><a href="$link">${link}</a>\n"""
		}
		markup << "</UL>\n"

		new File("LinkLister/${fileName}.html").write(markup.toString())
	}