Skip to content

Instantly share code, notes, and snippets.

@cocodrips
Last active December 17, 2015 17:39
Show Gist options
  • Save cocodrips/5647741 to your computer and use it in GitHub Desktop.
Save cocodrips/5647741 to your computer and use it in GitHub Desktop.
WebページのURLを抽出するだけ
def extract_links(self, url):
    """Fetch the page at *url* and collect the http:// links found in it.

    Args:
        url: Address of the page to scan. It must start with ``http://``.

    Returns:
        A list with the ``href`` value of every ``<a>`` element whose value
        starts with an ``http://`` URL, in the order ``findAll`` yields them.
        The string ``"error"`` when *url* is not an ``http://`` URL or the
        page cannot be fetched.
        An empty list when the response cannot be read or parsed.
    """
    # NOTE: only plain http:// is accepted, both for *url* and for the hrefs
    # found in the page; https:// and relative links are filtered out.
    htmlPattern = re.compile(r'(http://[A-Za-z0-9\'~+\-=_.,/%\?!;:@#\*&\(\)]+)')
    if htmlPattern.match(url) is None:
        return "error"

    try:
        html = urllib2.urlopen(url)
    except urllib2.URLError:
        # HTTPError subclasses URLError, so this covers HTTP status failures
        # as well as connection-level failures. The latter carry no ``code``
        # attribute, so the handler must not inspect it.
        return "error"

    try:
        try:
            soup = BeautifulSoup(html.read(), "html5lib")
        finally:
            # Release the connection even when reading or parsing fails.
            html.close()

        urls = []
        for anchor in soup.findAll('a'):
            href = anchor.get('href')
            # The pattern is ASCII-only, so it can be matched against the
            # attribute text directly without an encode round-trip.
            if href is not None and htmlPattern.match(href) is not None:
                urls.append(href)
        return urls
    except Exception:
        # Best effort: an unreadable or unparsable page yields no links, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return []
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment