Created
May 13, 2017 10:44
-
-
Save cell13/ab309c0943d362a8a5c9e3e510c8e529 to your computer and use it in GitHub Desktop.
python_tools
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print every absolute http:// link found on a web page (demo).
#
# Requires the third-party beautifulsoup4 package; install it first with:
#     pip install beautifulsoup4
import re
import urllib.request

from bs4 import BeautifulSoup

# Open the page inside a context manager so the HTTP connection is
# closed even if parsing raises.  BeautifulSoup reads the response
# eagerly, so using `soup` after the `with` block is safe.
with urllib.request.urlopen("http://arstechnica.com") as html_page:
    soup = BeautifulSoup(html_page, "html.parser")

# The raw-string regex keeps only absolute links with an http:// scheme
# (https:// and site-relative hrefs are deliberately excluded, as in
# the original demo).
for link in soup.find_all("a", attrs={"href": re.compile(r"^http://")}):
    print(link.get("href"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract absolute http:// links from a web page (demo).
#
# Requires the third-party beautifulsoup4 package; install it first with:
#     pip install beautifulsoup4
import re
import urllib.request

from bs4 import BeautifulSoup

# Compiled once at module level so repeated getLinks() calls reuse it.
# Matches only absolute hrefs with an http:// scheme, as in the demo.
_HTTP_LINK = re.compile(r"^http://")


def getLinks(url):
    """Return a list of absolute ``http://`` hrefs found at *url*.

    Fetches *url* over HTTP, parses the response body with
    BeautifulSoup, and collects the ``href`` attribute of every
    ``<a>`` tag whose href starts with ``http://``.

    Raises ``urllib.error.URLError`` (or a subclass) if the fetch
    fails.
    """
    # Context manager guarantees the HTTP connection is closed.
    with urllib.request.urlopen(url) as html_page:
        soup = BeautifulSoup(html_page, "html.parser")
    return [link.get("href")
            for link in soup.find_all("a", attrs={"href": _HTTP_LINK})]


print(getLinks("http://arstechnica.com"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment