Skip to content

Instantly share code, notes, and snippets.

@himerzi
Created September 15, 2011 15:46
Show Gist options
  • Save himerzi/1219614 to your computer and use it in GitHub Desktop.
Save himerzi/1219614 to your computer and use it in GitHub Desktop.
HTML Link constructor
/*
*Michael Detmold Groovy Coursework 1. November 2010.
*This program will fetch a web page and then construct an index of all the other pages the web page links to. It can do this for several layers of linking.
*/
//Script-level settings. They are assigned without 'def' so that they are stored in the script
//binding and can be read from inside the methods defined further down.
address = "http://www.google.com"//this is the base address to start building the list of links from (also prefixed to relative links)
levels = 2 //this is how many levels of linking within the base website the program should go (the base page counts as the first level).
// '/' DENOTES A REGULAR EXPRESSION (a Groovy "slashy" string)
//forward slashes in REGEX need to be escaped
//this regex matches the href attribute and the double-quoted value which follows it:
//either a full URL address starting with http (in any letter case), or any other quoted value such as a relative link.
regex = /[hH][Rr][Ee][Ff]=(("[Hh][Tt][Tt][Pp].*?")|((".*?\/?")))/
new File("LinkLister").mkdir() //directory where generated html files are stored
//start the crawl at the base address, at level 0
recursion(address,0)
//Crawls one page and then, recursively, every page it links to.
//page: URL of the page to process.
//i:    how many levels have already been descended; the crawl stops once it equals 'levels'.
def recursion(page,i){
    //depth limit reached: nothing to do for this page
    if(i==levels){
        return
    }
    def nextLevel = i + 1
    download(page) //fetch the page into the local download file
    load() //loads file
    def links = parse()
    //the index file is named after the last '/'-separated part of the page URL
    def indexName = page.tokenize("/")[-1]
    write(links,indexName)
    links.each{ link ->
        recursion(link,nextLevel)
    }
}
//Downloads the resource at the given URL into a file named "download" in the current
//working directory, replacing any previous content.
//page: the URL to fetch, as a String.
//Throws MalformedURLException if page is not a valid URL and IOException if the transfer fails.
def download(page)
{
    //withInputStream/withOutputStream close both streams even when the copy throws,
    //so neither the network connection nor the file handle is leaked.
    new URL(page).withInputStream{ input ->
        new File("download").withOutputStream{ output ->
            output << input
        }
    }
}
//Loads the downloaded file: points the script-level 'file' variable at the "download" file
//in the current working directory (the location download() writes to) so that parse() can read it.
def load(){
    //This used to be the absolute path "/Users/md/download", which only matched the file written
    //by download() when the script happened to be started from that directory.
    def path = "download"
    file = new File(path)
}
//Extracts the links from the file referenced by the script-level 'file' variable.
//Every href="..." attribute matched by 'regex' is collected (duplicates removed) and turned into a URL:
//  - a value starting with "/" is a relative link and is prefixed with 'address';
//  - a value starting with http:// or https:// (any letter case) is an absolute link and is kept as is;
//  - anything else (empty href, mailto:, javascript:, fragments, ...) is skipped, because it
//    could not be opened as a URL afterwards (a MalformedURLException would occur).
//Returns the list of URLs, in order of first appearance.
def parse(){
    def hrefs = []
    //Using regular expressions, this closure extracts the href="" attribute of the anchor tag
    file.eachLine{ aLine ->
        hrefs.addAll(aLine.findAll(regex)) //addAll method ensures that the list is "flat"
    }
    hrefs.unique() //removes duplicates (done once, after all lines have been read)

    def list = []
    //Once the href attribute has been isolated, it is further parsed by tokenizing into just the URL or the relative link
    hrefs.each{ href ->
        def link = href.tokenize("\"")[1]
        if(link == null){
            return //href="" has no value between the quotes; skip it
        }
        if(link.startsWith("/")){
            //relative links are converted into absolute links
            list << address + link
        }else if(link ==~ /(?i)https?:\/\/.*/){
            //a full absolute link, no need to modify it.
            //(The previous check here used ==~ against a lone "/", which never matched,
            //so this branch was unreachable and absolute links were silently dropped.)
            list << link
        }
    }
    return list
}
//Writes an HTML index of the given links to LinkLister/<fileName>.html
//(the LinkLister directory must already exist). The list is also printed to standard output.
//list:     the links to put in the index, one list item per link.
//fileName: name of the page the links were found on; used in the heading and as the file name.
def write(list,fileName){
    println list
    //assemble the document: opening tag, heading, one item per link, closing tag
    def markup = new StringBuilder()
    markup << "<UL>\n"
    markup << "<H1>$fileName has links to:</H1>\n"
    list.each{
        markup << """<LI><a href="$it">${it}</a>\n"""
    }
    markup << "</UL>\n"
    new File("LinkLister/${fileName}.html").write(markup.toString())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment