Basic script to scrape a website with Python using only the standard library. More error checking should go into it.
# -----------------------------------------------------------------------
from urllib.request import Request, urlopen
from urllib.error import URLError


def get_html(url):
    # construct an http request for the given url
    req = Request(url,
                  data=None,
                  headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)'
                                         ' AppleWebKit/537.36 (KHTML, like Gecko)'
                                         ' Chrome/35.0.1916.47 Safari/537.36'})
    # send the request and fetch the html
    html = None
    try:
        html = urlopen(req)
    except URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
    # on error, simply return an empty binary string
    if html is None:
        print('Server not found')
        html = b''
    # on success, read the html content into a binary string
    else:
        html = html.read()
    return html
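# -----------------------------------------------------------------------
# Possible extra error checking (a sketch only, not called by the crawler
# below): the description notes that more error checking should go into
# this script, so this variant adds a request timeout and distinguishes
# HTTP errors from unreachable servers. The function name and the
# 10-second timeout are arbitrary choices, not part of the original gist.
from urllib.error import HTTPError


def get_html_checked(url, timeout=10):
    req = Request(url,
                  data=None,
                  headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urlopen(req, timeout=timeout) as response:
            return response.read()
    except HTTPError as e:
        # the server answered but refused the request (404, 500, ...)
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
    except URLError as e:
        # the server could not be reached (dns failure, refused connection, ...)
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
    return b''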
# -----------------------------------------------------------------------
import re

url_binary_regex = rb'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


def find_urls(html_binary):
    # extract every absolute http(s) url appearing in the page
    urls = re.findall(url_binary_regex, html_binary)
    urls = [url.decode('utf-8') for url in urls]
    return urls
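# Example (sketch): on the byte string b'<a href="https://www.google.com/about">',
# find_urls should return ['https://www.google.com/about'].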
# -----------------------------------------------------------------------
from urllib.parse import urlparse


def has_bad_format(url):
    # urls pointing to images are not worth visiting
    exts = ['.gif', '.png', '.jpg']
    return any(ext in url for ext in exts)


def filter_urls(urls, netloc):
    # keep only urls on the given host, and drop links to images
    urls = [url for url in urls if urlparse(url).netloc == netloc]
    urls = [url for url in urls if not has_bad_format(url)]
    return urls
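# Example (sketch): filter_urls(['https://www.google.com/maps', 'https://example.org/logo.png'],
#                               'www.google.com') should keep only the first url.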
# -----------------------------------------------------------------------
def process_html(url, b_html):
    # do something useful here
    print('Visiting url : {}'.format(url))


# crawl the site starting from start_url, visiting each page at most once
start_url = 'http://www.google.com/'
to_visit = set([start_url])
visited = set()

while to_visit:
    # take any url we have not visited yet and mark it as visited
    url = to_visit.pop()
    visited.add(url)
    # fetch the page and hand it to the processing function
    html = get_html(url)
    process_html(url, html)
    # collect the outgoing links that stay on the same host
    links = find_urls(html)
    links = filter_urls(links, 'www.google.com')
    links = set(links)
    # schedule only the links we have not seen before
    newlinks = (links - visited) - to_visit
    to_visit = to_visit | newlinks