# julien-h/BeautifulSoup scraping in python3

## BeautifulSoup scraping in python3
# First, use URLLIB to fetch HTML files
# -----------------------------------------------------------------------

from urllib.request import Request, urlopen
from urllib.error import URLError

def get_html(url):
    """Fetch the resource at *url* and return its raw body as bytes.

    The request is sent with a desktop-browser ``User-Agent`` header.
    A ``URLError`` (including ``HTTPError``) raised while opening the URL
    is reported on stdout instead of being propagated.

    Args:
        url: Address to fetch; passed straight to ``urllib.request.Request``.

    Returns:
        The response body as ``bytes``, or ``b''`` when opening the URL
        raised a ``URLError``.
    """
    # construct an http request for the given url
    req = Request(url,
                  data=None,
                  headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})

    # send request; only the call that can fail lives inside the try
    try:
        response = urlopen(req)
    except URLError as e:
        # HTTPError is a URLError subclass that carries both `code` and
        # `reason`, so `code` must be tested first or the HTTP branch
        # could never run.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        elif hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)

        # on error, simply return an empty binary string
        print('Server not found')
        return b''

    # on success, read the html content into a binary string; the `with`
    # closes the underlying connection even if read() fails
    with response:
        return response.read()


# Then, use BEAUTIFULSOUP to parse HTML
# -----------------------------------------------------------------------

from bs4 import BeautifulSoup

# Fetch html from url

url = 'https://medium.com/personal-growth/there-are-two-ways-to-read-one-is-useless-cc152cf4f51b'

# Download the page and parse it with the standard-library HTML parser
html = get_html(url)
soup = BeautifulSoup(html, 'html.parser')

# Candidate containers for the main content: the direct parent of every
# paragraph (a parent is listed once per paragraph it holds)

ps = soup.select('p')
parents = list(map(lambda paragraph: paragraph.parent, ps))

def count_child_paragraphs(element):
    """Return how many ``<p>`` tags are direct children of *element*.

    Args:
        element: A BeautifulSoup tag (any object exposing ``find_all``).

    Returns:
        The number of immediate ``<p>`` children; paragraphs nested deeper
        in the subtree are not counted.
    """
    # `recursive=False` limits the search to direct children. The earlier
    # spelling `recurvise` was not that flag: Beautiful Soup treats unknown
    # keyword arguments as attribute filters, so the whole subtree was
    # searched instead.
    return len(element.find_all('p', recursive=False))

if parents:
    # Stable descending sort: the first parent with the highest paragraph
    # count ends up at index 0
    parents.sort(key=count_child_paragraphs, reverse=True)
    main_div = parents[0]

    # Add the main title (h1) if it's not already there

    if not main_div.find_all('h1'):
        titles = soup.find_all('h1')
        if titles:
            main_title = titles[0]
            # insert() moves the tag out of its old place in the tree
            main_div.insert(0, main_title)

    # That's it, we have the main content, let's write it to a new file.
    # An explicit encoding keeps non-ASCII text from depending on the
    # platform's default codec.

    with open('output.html', 'w', encoding='utf-8') as file:
        file.write(str(main_div))
else:
    # No <p> at all (e.g. the download failed and the document is empty):
    # indexing parents[0] would raise IndexError, so report and stop here.
    print('No paragraphs found, nothing to write')
	# First, use URLLIB to fetch HTML files
	# -----------------------------------------------------------------------

	from urllib.request import Request, urlopen
	from urllib.error import URLError

	def get_html(url):
		"""Fetch the resource at *url* and return its raw body as bytes.

		The request is sent with a desktop-browser ``User-Agent`` header.
		A ``URLError`` (including ``HTTPError``) raised while opening the
		URL is reported on stdout instead of being propagated.

		Args:
			url: Address to fetch; passed straight to ``urllib.request.Request``.

		Returns:
			The response body as ``bytes``, or ``b''`` when opening the URL
			raised a ``URLError``.
		"""
		# construct an http request for the given url
		req = Request(url,
			data=None,
			headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})

		# send request; only the call that can fail lives inside the try
		try:
			response = urlopen(req)
		except URLError as e:
			# HTTPError is a URLError subclass that carries both `code` and
			# `reason`, so `code` must be tested first or the HTTP branch
			# could never run.
			if hasattr(e, 'code'):
				print('The server couldn\'t fulfill the request.')
				print('Error code: ', e.code)
			elif hasattr(e, 'reason'):
				print('We failed to reach a server.')
				print('Reason: ', e.reason)

			# on error, simply return an empty binary string
			print('Server not found')
			return b''

		# on success, read the html content into a binary string; the `with`
		# closes the underlying connection even if read() fails
		with response:
			return response.read()




	# Then, use BEAUTIFULSOUP to parse HTML
	# -----------------------------------------------------------------------

	from bs4 import BeautifulSoup

	# Fetch html from url

	url = 'https://medium.com/personal-growth/there-are-two-ways-to-read-one-is-useless-cc152cf4f51b'

	# Download the page and parse it with the standard-library HTML parser
	html = get_html(url)
	soup = BeautifulSoup(html, 'html.parser')

	# Candidate containers for the main content: the direct parent of every
	# paragraph (a parent is listed once per paragraph it holds)

	ps = soup.select('p')
	parents = list(map(lambda paragraph: paragraph.parent, ps))

	def count_child_paragraphs(element):
		"""Return how many ``<p>`` tags are direct children of *element*.

		Args:
			element: A BeautifulSoup tag (any object exposing ``find_all``).

		Returns:
			The number of immediate ``<p>`` children; paragraphs nested
			deeper in the subtree are not counted.
		"""
		# `recursive=False` limits the search to direct children. The earlier
		# spelling `recurvise` was not that flag: Beautiful Soup treats
		# unknown keyword arguments as attribute filters, so the whole
		# subtree was searched instead.
		return len(element.find_all('p', recursive=False))

	if parents:
		# Stable descending sort: the first parent with the highest
		# paragraph count ends up at index 0
		parents.sort(key=count_child_paragraphs, reverse=True)
		main_div = parents[0]

		# Add the main title (h1) if it's not already there

		if not main_div.find_all('h1'):
			titles = soup.find_all('h1')
			if titles:
				main_title = titles[0]
				# insert() moves the tag out of its old place in the tree
				main_div.insert(0, main_title)

		# That's it, we have the main content, let's write it to a new file.
		# An explicit encoding keeps non-ASCII text from depending on the
		# platform's default codec.

		with open('output.html', 'w', encoding='utf-8') as file:
			file.write(str(main_div))
	else:
		# No <p> at all (e.g. the download failed and the document is
		# empty): indexing parents[0] would raise IndexError, so report and
		# stop here.
		print('No paragraphs found, nothing to write')