Basic script to scrape a website with Python using only the standard library. More error checking should go into it.
# -----------------------------------------------------------------------
from urllib.request import Request, urlopen
from urllib.error import URLError


def get_html(url):
    # construct an http request for the given url
    req = Request(url,
                  data=None,
                  headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)'
                                         ' AppleWebKit/537.36 (KHTML, like Gecko)'
                                         ' Chrome/35.0.1916.47 Safari/537.36'})
    # send the request and fetch the html
    html = None
    try:
        html = urlopen(req)
    except URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
    # on error, simply return an empty binary string
    if html is None:
        print('Server not found')
        html = b''
    # on success, read the html content into a binary string
    else:
        html = html.read()
    return html
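# -----------------------------------------------------------------------
# Possible extra error checking (a sketch only, not called by the crawler
# below): the description notes that more error checking should go into
# this script, so this variant adds a request timeout and distinguishes
# HTTP errors from unreachable servers. The function name and the
# 10-second timeout are arbitrary choices, not part of the original gist.
from urllib.error import HTTPError


def get_html_checked(url, timeout=10):
    req = Request(url,
                  data=None,
                  headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urlopen(req, timeout=timeout) as response:
            return response.read()
    except HTTPError as e:
        # the server answered but refused the request (404, 500, ...)
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
    except URLError as e:
        # the server could not be reached (dns failure, refused connection, ...)
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
    return b''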
# -----------------------------------------------------------------------
import re

url_binary_regex = rb'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


def find_urls(html_binary):
    # extract every absolute http(s) url appearing in the page
    urls = re.findall(url_binary_regex, html_binary)
    urls = [url.decode('utf-8') for url in urls]
    return urls
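# Example (sketch): on the byte string b'<a href="https://www.google.com/about">',
# find_urls should return ['https://www.google.com/about'].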
# -----------------------------------------------------------------------
from urllib.parse import urlparse


def has_bad_format(url):
    # urls pointing to images are not worth visiting
    exts = ['.gif', '.png', '.jpg']
    return any(ext in url for ext in exts)


def filter_urls(urls, netloc):
    # keep only urls on the given host, and drop links to images
    urls = [url for url in urls if urlparse(url).netloc == netloc]
    urls = [url for url in urls if not has_bad_format(url)]
    return urls
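# Example (sketch): filter_urls(['https://www.google.com/maps', 'https://example.org/logo.png'],
#                               'www.google.com') should keep only the first url.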
# -----------------------------------------------------------------------
def process_html(url, b_html):
    # do something useful here
    print('Visiting url : {}'.format(url))


# crawl the site starting from start_url, visiting each page at most once
start_url = 'http://www.google.com/'
to_visit = set([start_url])
visited = set()

while to_visit:
    # take any url we have not visited yet and mark it as visited
    url = to_visit.pop()
    visited.add(url)
    # fetch the page and hand it to the processing function
    html = get_html(url)
    process_html(url, html)
    # collect the outgoing links that stay on the same host
    links = find_urls(html)
    links = filter_urls(links, 'www.google.com')
    links = set(links)
    # schedule only the links we have not seen before
    newlinks = (links - visited) - to_visit
    to_visit = to_visit | newlinks