Skip to content

Instantly share code, notes, and snippets.

@cell13
Created May 13, 2017 10:44
Show Gist options
  • Save cell13/ab309c0943d362a8a5c9e3e510c8e529 to your computer and use it in GitHub Desktop.
Save cell13/ab309c0943d362a8a5c9e3e510c8e529 to your computer and use it in GitHub Desktop.
python_tools
# Demo: get the links from a website.
# Requires Beautiful Soup 4 (bs4). If it is not installed, install it first with:
#   pip install beautifulsoup4
from bs4 import BeautifulSoup
import urllib2
import re
# Download the page. The response is closed explicitly in `finally` because
# Python 2's urllib2 responses cannot be used in a `with` statement.
html_page = urllib2.urlopen("http://arstechnica.com")
try:
    # BeautifulSoup reads the whole response while building the tree, so the
    # connection is no longer needed once parsing is done.
    soup = BeautifulSoup(html_page, "html.parser")
finally:
    html_page.close()

# Print the target of every <a> tag whose href starts with "http://".
# Relative links and https:// links do not match this pattern.
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    # Parenthesised form prints the same text under Python 2 and also parses
    # under Python 3, matching the print call used in the second demo.
    print(link.get('href'))
# Demo: a function that extracts the links from a web page.
# Requires Beautiful Soup 4 (bs4). If it is not installed, install it first with:
#   pip install beautifulsoup4
from bs4 import BeautifulSoup
import urllib2
import re
def getLinks(url, pattern="^http://"):
    """Return the href of every <a> tag on a page whose href matches *pattern*.

    Args:
        url: Address of the page to download; passed to ``urllib2.urlopen``.
        pattern: Regular expression (string or compiled pattern) used as the
            BeautifulSoup filter on each anchor's ``href`` attribute. The
            default keeps only links that start with ``http://``, so relative
            links and ``https://`` links are excluded unless a different
            pattern is supplied.

    Returns:
        A list of the matching href values, in the order ``findAll``
        yields the anchors. The list is empty when nothing matches.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the page cannot be fetched,
        and ``re.error`` when *pattern* is not a valid regular expression.
    """
    # Compile first so an invalid pattern fails before any network traffic.
    href_filter = re.compile(pattern)

    html_page = urllib2.urlopen(url)
    try:
        # BeautifulSoup reads the whole response while building the tree, so
        # the connection can be released as soon as parsing is done.
        soup = BeautifulSoup(html_page, "html.parser")
    finally:
        # urllib2 responses are not context managers in Python 2; close
        # explicitly so the socket is not leaked.
        html_page.close()

    return [
        link.get('href')
        for link in soup.findAll('a', attrs={'href': href_filter})
    ]
# Demo: collect the links from the Ars Technica front page and print the list.
found_links = getLinks("http://arstechnica.com")
print(found_links)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment