Skip to content

Instantly share code, notes, and snippets.

@Sparrow1029
Last active July 12, 2019 17:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Sparrow1029/29637c68b6b810baac227afc33e86029 to your computer and use it in GitHub Desktop.
Save Sparrow1029/29637c68b6b810baac227afc33e86029 to your computer and use it in GitHub Desktop.
Python Code Challenge 19 - scrape web article with requests and BS4
#!/usr/bin/env python3
"""
https://www.practicepython.org/exercise/2014/07/14/19-decode-a-web-page-two.html
----------------------------------------------
Exercise 19 - Decode a webpage part 2
----------------------------------------------
Monica Lewinsky Vanity Fair article:
https://www.vanityfair.com/style/society/2014/06/monica-lewinsky-humiliation-culture
"""
import os
import requests
from bs4 import BeautifulSoup
url = "https://www.vanityfair.com/style/society/2014/06/monica-lewinsky-humiliation-culture"

# Reuse the locally cached copy of the page when one exists, so repeated runs
# do not request the article again.
if os.path.isfile('Article.html'):
    # Read bytes (not text) so both branches hand the parser the same type and
    # decoding does not depend on the platform's default text encoding.
    with open('Article.html', 'rb') as file_obj:
        data = file_obj.read()
else:
    print("requesting...")
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    print(r.status_code)
    # Fail before caching: an error page saved to Article.html would be
    # picked up by the branch above on every later run.
    r.raise_for_status()
    data = r.content
    print(f"content-length: {len(data)}")
    # The context manager closes (and therefore flushes) the cache file.
    with open('Article.html', 'wb') as file_obj:
        file_obj.write(data)
# Parse the raw page with the lxml backend.
soup = BeautifulSoup(data, "lxml")

# Pieces of the article in output order: title, byline(s), date, paragraphs.
file_text = []

# find() returns None when no tag matches, so guard before reading .text
# instead of crashing with AttributeError on a page without an <h1>.
title = soup.find("h1")
if title is not None:
    file_text.append(title.text)

# Of the <div class="byline"> elements, keep only those that also carry the
# 'bylines__byline' class.
file_text.extend(
    div.text
    for div in soup.find_all("div", class_="byline")
    if 'bylines__byline' in div['class']
)

# Same None guard as for the title: a page without <time> is skipped, not fatal.
date = soup.find("time")
if date is not None:
    file_text.append(date.text)

# Only <p> tags with no attributes at all are collected; any paragraph that
# has an attribute (class, id, ...) is skipped.
file_text.extend(p.text for p in soup.find_all("p") if not p.attrs)
# Write each collected item on its own line. The encoding is pinned to UTF-8
# so non-ASCII characters in the scraped text are written the same way on
# every platform instead of depending on the locale's default encoding.
with open('MonicaLewinsky.txt', 'w', encoding='utf-8') as fh:
    fh.writelines(f"{item}\n" for item in file_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment