Last active
April 15, 2021 18:54
-
-
Save ageis/e45c31b8d0d058b6560f0a024bc09865 to your computer and use it in GitHub Desktop.
Extract href attribute values (hyperlinks) from the anchor tags of a webpage.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Print the href of every <a> tag found on a web page.

This script targets Python 2: it relies on ``urllib2`` and on the
``BeautifulSoup`` top-level module name.
"""
from contextlib import closing

from BeautifulSoup import BeautifulSoup
import urllib2
import re  # not used by this script; kept because it was imported originally

# closing() releases the HTTP connection as soon as the markup has been
# parsed, even if parsing raises.
with closing(urllib2.urlopen("http://example.com/example.html")) as html_page:
    soup = BeautifulSoup(html_page)

for link in soup.findAll('a'):
    # get() yields None for anchors without an href, so those print "None".
    # Parenthesised single-argument print behaves the same as the original
    # Python 2 print statement and is also valid Python 3 syntax.
    print(link.get('href'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Print the href attribute of every <a> start tag in an HTML document.

When run as a script, the HTML is read from standard input.
"""
import sys

try:
    from HTMLParser import HTMLParser  # Python 2 module name
except ImportError:
    from html.parser import HTMLParser  # Python 3 module name


class MyHTMLParser(HTMLParser):
    """HTML parser that prints each anchor's href as ``href = <value>``."""

    def handle_starttag(self, tag, attrs):
        """Print the href attribute(s) of an ``<a>`` start tag.

        Args:
            tag: Name of the tag being opened.
            attrs: ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a":
            return
        for name, value in attrs:
            if name == "href":
                # One pre-formatted string prints identically under the
                # Python 2 print statement and the Python 3 print function.
                print("%s = %s" % (name, value))


if __name__ == "__main__":
    # The original fed an undefined placeholder (NameError on every run);
    # take the document from stdin so the script is usable as-is.
    your_html_string = sys.stdin.read()
    parser = MyHTMLParser()
    parser.feed(your_html_string)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Download a page and print the href of every <a> tag that has one.

usage: ./find_hyperlinks.py "https://example.com/example.txt"

This script targets Python 2 (it uses the ``BeautifulSoup`` top-level
module name). The download is saved in the current directory.
"""
import os
import sys
import wget
from BeautifulSoup import BeautifulSoup

# Fail with a usage message instead of an IndexError when no URL is given.
if len(sys.argv) != 2:
    sys.exit('usage: %s URL' % sys.argv[0])

url = sys.argv[1]

# Actually fetch the page: the original only computed the URL's basename and
# handed that *name* to BeautifulSoup as if it were markup, so no link was
# ever found. download() returns the path it wrote to; bar=None keeps the
# progress bar out of the printed link list.
filename = wget.download(url, bar=None)

with open(filename, 'rb') as downloaded:
    soup = BeautifulSoup(downloaded)

for tag in soup.findAll('a', href=True):
    # Encode explicitly rather than calling sys.setdefaultencoding(): that
    # function is removed from sys at interpreter start-up, so the original
    # call raised AttributeError, and str() on a non-ASCII href would raise
    # UnicodeEncodeError.
    print(tag['href'].encode('utf-8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment