Skip to content

Instantly share code, notes, and snippets.

@julien-h
Created June 11, 2018 07:10
Show Gist options
  • Save julien-h/f1f9eb7af87376bd5baf393c66daedee to your computer and use it in GitHub Desktop.
Save julien-h/f1f9eb7af87376bd5baf393c66daedee to your computer and use it in GitHub Desktop.
Scraping with urllib and BeautifulSoup / python3
# First, use URLLIB to fetch HTML files
# -----------------------------------------------------------------------
from urllib.request import Request, urlopen
from urllib.error import URLError
def get_html(url):
    """Fetch *url* and return the raw response body as bytes.

    The request is sent with a fixed, browser-like User-Agent header.
    Failures are reported on stdout instead of being raised: when
    ``urlopen`` raises ``URLError`` (which includes HTTP error statuses),
    an empty byte string is returned.
    """
    # construct an http request for the given url
    req = Request(
        url,
        data=None,
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'},
    )
    # send request; keep the try body down to the one call that can raise
    try:
        response = urlopen(req)
    except URLError as e:
        # HTTPError is a URLError subclass carrying BOTH `code` and `reason`,
        # so `code` must be tested first -- testing `reason` first would make
        # the HTTP-status branch unreachable.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        else:
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
            print('Server not found')
        # on error, simply return an empty binary string
        return b''
    # on success, read the html content into a binary string; the context
    # manager closes the underlying connection once the body has been read
    with response:
        return response.read()
# Then, use BEAUTIFULSOUP to parse HTML
# -----------------------------------------------------------------------
from bs4 import BeautifulSoup
# Download the article and build a parse tree from its markup
url = 'https://medium.com/personal-growth/there-are-two-ways-to-read-one-is-useless-cc152cf4f51b'
html = get_html(url)
soup = BeautifulSoup(markup=html, features='html.parser')
# Candidates for the main div: the direct parent of every paragraph.
# A parent appears once per paragraph it holds, so duplicates are expected.
ps = soup.select('p')
parents = [paragraph.parent for paragraph in ps]
def count_child_paragraphs(element):
    """Return the number of <p> tags that are direct children of *element*."""
    # The keyword must be spelled `recursive`: a misspelled keyword is taken
    # by BeautifulSoup as an attribute filter, which leaves the search
    # recursive and counts paragraphs nested at any depth.
    return len(element.find_all('p', recursive=False))
# Without any paragraph there is no candidate to pick (this is always the
# case when the download failed and the html is empty), so stop with a clear
# message instead of an IndexError on parents[0].
if not parents:
    raise SystemExit('No <p> elements found in the page; nothing to extract.')
# The main div is the parent holding the most paragraphs: sort the
# candidates by paragraph count, largest first, and take the winner.
parents.sort(key=count_child_paragraphs, reverse=True)
main_div = parents[0]
# Prepend the page's first h1 to the main content, unless the main content
# already contains an h1 of its own
if main_div.find('h1') is None:
    main_title = soup.find('h1')
    if main_title is not None:
        # inserting moves the tag out of its old position in the tree
        main_div.insert(0, main_title)
# That's it, we have the main content, let's write it to a new file.
# The encoding is given explicitly: the platform default may be unable to
# represent non-ASCII characters in the article text.
with open('output.html', 'w', encoding='utf-8') as file:
    file.write(str(main_div))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment