cocodrips/gist:5647741

## gistfile1.py
def extract_links(self, url):
    """Fetch ``url`` and collect the ``http://`` links of its anchor tags.

    Args:
        url: Address of the page to scan. It must begin with ``http://``
            followed by at least one character from the permitted set.
            The check is anchored at the start of the string only, and
            ``https://`` addresses do not pass it.

    Returns:
        A list holding the ``href`` value of every ``<a>`` element whose
        value passes the same ``http://`` check.
        The string ``"error"`` when ``url`` fails the check or the page
        cannot be opened.
        An empty list when the response cannot be read or parsed.
    """
    htmlPattern = re.compile(r'(http://[A-Za-z0-9\'~+\-=_.,/%\?!;:@#\*&\(\)]+)')

    if htmlPattern.match(url) is None:
        return "error"

    try:
        html = urllib2.urlopen(url)
    except urllib2.URLError:
        # HTTPError is a subclass of URLError, so HTTP failures land here
        # too. Only HTTPError has a ``code`` attribute; inspecting it on a
        # plain URLError (DNS failure, refused connection) would raise
        # AttributeError, so the status is deliberately not examined.
        return "error"

    try:
        try:
            markup = html.read()
        finally:
            # Release the connection whether or not the read succeeded.
            html.close()
        soup = BeautifulSoup(markup, "html5lib")
        hrefs = (anchor.get('href') for anchor in soup.findAll('a'))
        # The pattern is ASCII-only, so the href can be matched directly;
        # no round trip through a UTF-8 byte string is needed.
        return [href for href in hrefs
                if href is not None and htmlPattern.match(href) is not None]
    except Exception:
        # Best effort: an unreadable or unparsable page yields no links.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return []
	def extract_links(self, url):
		"""Fetch ``url`` and collect the ``http://`` links of its anchor tags.

		Args:
			url: Address of the page to scan. It must begin with ``http://``
				followed by at least one character from the permitted set.
				The check is anchored at the start of the string only, and
				``https://`` addresses do not pass it.

		Returns:
			A list holding the ``href`` value of every ``<a>`` element whose
			value passes the same ``http://`` check.
			The string ``"error"`` when ``url`` fails the check or the page
			cannot be opened.
			An empty list when the response cannot be read or parsed.
		"""
		htmlPattern = re.compile(r'(http://[A-Za-z0-9\'~+\-=_.,/%\?!;:@#\*&\(\)]+)')

		if htmlPattern.match(url) is None:
			return "error"

		try:
			html = urllib2.urlopen(url)
		except urllib2.URLError:
			# HTTPError is a subclass of URLError, so HTTP failures land here
			# too. Only HTTPError has a ``code`` attribute; inspecting it on a
			# plain URLError (DNS failure, refused connection) would raise
			# AttributeError, so the status is deliberately not examined.
			return "error"

		try:
			try:
				markup = html.read()
			finally:
				# Release the connection whether or not the read succeeded.
				html.close()
			soup = BeautifulSoup(markup, "html5lib")
			urls = []
			for anchor in soup.findAll('a'):
				href = anchor.get('href')
				if href is None:
					continue
				# The pattern is ASCII-only, so the href can be matched
				# directly; no round trip through UTF-8 bytes is needed.
				if htmlPattern.match(href) is not None:
					urls.append(href)
			return urls
		except Exception:
			# Best effort: an unreadable or unparsable page yields no links.
			# Exception (not a bare except) lets KeyboardInterrupt/SystemExit
			# propagate.
			return []