Created
September 15, 2011 15:46
-
-
Save himerzi/1219614 to your computer and use it in GitHub Desktop.
HTML Link constructor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Michael Detmold Groovy Coursework 1. November 2010.
 * This program fetches a web page and then constructs an index of all the other pages that page links to. It can repeat this for several levels of linking.
 */
// Directory that receives the generated HTML index pages.
new File("LinkLister").mkdir()

// Crawl settings. They are assigned without `def` so that they live in the script
// binding and can be read from inside the methods of this script.
address = "http://www.google.com" // base address the list of links is built from
levels = 2                        // how many levels of linking to follow from the base address

// The /.../ form is a Groovy slashy string used as a regular expression; forward
// slashes inside it have to be escaped.
// The pattern matches an href attribute followed by either a double-quoted value
// starting with "http" (any letter case) or any other double-quoted value.
regex = /[hH][Rr][Ee][Ff]=(("[Hh][Tt][Tt][Pp].*?")|((".*?\/?")))/

// Start crawling at the base address, level 0.
recursion(address, 0)
/*
 * Indexes `page` and then every page it links to, stopping once `levels`
 * (script-level setting) levels have been processed.
 *
 * page - URL string of the page to index
 * i    - number of levels already processed on the way to this page
 *
 * For each visited page this downloads it, parses out its links, writes an HTML
 * index named after the last "/"-separated token of the URL, and recurses into
 * every extracted link.
 */
def recursion(page,i){
    // Depth limit reached: stop descending.
    if(i == levels){
        return
    }
    def nextLevel = i + 1

    download(page)          // save the page body to disk
    load()                  // point the shared `file` variable at the saved copy
    def links = parse()

    // e.g. "http://www.google.com" tokenizes to ["http:", "www.google.com"]
    def pageName = page.tokenize("/")[-1]
    write(links, pageName)

    links.each{ link ->
        recursion(link, nextLevel)
    }
}
/*
 * Copies the content found at `page` into a file named "download" in the
 * current working directory, replacing any previous content.
 *
 * page - URL string to fetch
 *
 * Throws MalformedURLException when `page` is not a valid URL and IOException
 * when the content cannot be read or the file cannot be written.
 */
def download(page)
{
    // Based on http://groovy.codehaus.org/Simple+file+download+from+URL
    // The with* helpers close both streams even when the copy fails part-way,
    // so neither the connection nor the file handle is leaked.
    // The output file is opened first, as before, so it is truncated even when
    // the URL turns out to be unusable.
    new File("download").withOutputStream{ output ->
        new URL(page).withInputStream{ input ->
            output << input
        }
    }
}
/*
 * Points the script-level variable `file` at the downloaded page.
 *
 * Uses the relative path "download", which is the same path download() writes
 * to, so the script no longer depends on being run from one particular
 * user's home directory.
 *
 * Returns the File that was assigned to `file`.
 */
def load(){
    file = new File("download")
}
/*
 * Extracts link targets from the downloaded page held in the script-level
 * variable `file`.
 *
 * Each line is scanned with the script-level `regex`; duplicate href matches are
 * dropped, and the double-quoted value of every remaining match is classified:
 *   - a value starting with "/" is prefixed with the script-level `address`;
 *   - a value starting with http:// or https:// (any letter case) is kept as is;
 *   - anything else (an empty value, "#fragment", "javascript:...", ...) is skipped.
 *
 * Returns a list of URL strings, ordered by first appearance in the file.
 */
def parse(){
    def hrefs = []
    // findAll yields the complete text of each match, e.g. href="/about";
    // addAll keeps the collected list flat.
    file.eachLine{ aLine ->
        hrefs.addAll(aLine.findAll(regex))
    }
    hrefs.unique() // remove duplicates once, after every line has been read

    def list = []
    hrefs.each{ href ->
        // Splitting on the double quote leaves the attribute value at index 1;
        // for href="" there is no such token and the value is null.
        def link = href.tokenize("\"")[1]
        if(link == null){
            return // empty href: nothing to follow
        }
        if(link.startsWith("/")){
            // relative links are converted into absolute links
            list << address + link
        }else if(link ==~ /(?i)https?:\/\/.*/){
            // full absolute link: no need to modify it
            list << link
        }
        // Everything else is ignored. The previous test here compared the value
        // with the single string "/" and therefore also discarded every absolute link.
    }
    return list
}
/*
 * Writes an HTML index of `list` to LinkLister/<fileName>.html.
 *
 * list     - link strings to include; the list is also printed to standard output
 * fileName - name used for the output file and in the page heading
 *
 * The links and the file name are inserted into the markup as they are,
 * without HTML escaping.
 */
def write(list,fileName){
    println list

    // One list item per link, each followed by a newline.
    def items = list.collect{ link ->
        """<LI><a href="$link">${link}</a>\n"""
    }

    def markup = "<UL>\n" +
                 "<H1>$fileName has links to:</H1>\n" +
                 items.join("") +
                 "</UL>\n"

    new File("LinkLister/${fileName}.html").write(markup)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment