Skip to content

Instantly share code, notes, and snippets.

@Levi-Lesches
Last active September 2, 2020 06:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Levi-Lesches/3f74dc35cc52672ba10f767de7a12d3b to your computer and use it in GitHub Desktop.
Save Levi-Lesches/3f74dc35cc52672ba10f767de7a12d3b to your computer and use it in GitHub Desktop.
Scrapes headlines from the front page of nytimes.com
import requests
from bs4 import BeautifulSoup
import json
# Page that get_html() downloads.
URL = "https://www.nytimes.com/"
# Prefix get_script() looks for at the start of a <script> body;
# get_script_data() slices it off before JSON-decoding the remainder.
START_OF_JSON = "window.__preloadedData = "
def get_html(timeout=10.0):
    """Download the page at URL and return its body as text.

    Args:
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout, requests may block indefinitely.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(URL, timeout=timeout)
    # Fail here instead of handing an error page to the HTML parser.
    response.raise_for_status()
    return response.text
def get_script(soup):
    """Find the <script> whose text begins with START_OF_JSON.

    Each tag's `.string` is converted with str() and stripped before the
    prefix check. Returns the stripped text of the first match, or None
    when no script matches.
    """
    stripped_bodies = (
        str(tag.string).strip() for tag in soup.find_all("script")
    )
    return next(
        (body for body in stripped_bodies if body.startswith(START_OF_JSON)),
        None,
    )
def get_script_data(soup):
    """Extract and decode the JSON assigned to window.__preloadedData.

    Args:
        soup: Parsed document that is passed on to get_script().

    Returns:
        The object produced by json.loads() for the text following
        START_OF_JSON in the matching script.

    Raises:
        ValueError: If no script starting with START_OF_JSON is found.
            json.JSONDecodeError (a ValueError subclass) is raised when
            the remaining text is not valid JSON.
    """
    script = get_script(soup)
    if script is None:
        # get_script() returns None when nothing matched; report that
        # clearly instead of failing on a None subscript.
        raise ValueError(
            f"no <script> starting with {START_OF_JSON!r} was found"
        )
    payload = script[len(START_OF_JSON):].rstrip()
    # The assignment normally ends with a semicolon; drop it only when it
    # is really there so the closing brace is never cut off.
    if payload.endswith(";"):
        payload = payload[:-1]
    return json.loads(payload)
def parse_data(data):
    """Collect the distinct headline strings from the decoded page data.

    Args:
        data: Mapping with an "initialState" key whose value is a mapping
            of entries.

    Returns:
        A set of every non-empty string stored under a "headline" key of
        a dict-valued entry.

    Raises:
        KeyError: If `data` has no "initialState" key.
    """
    result = set()
    for entry in data["initialState"].values():
        # Only dict entries can carry a "headline" field. On a str or list
        # the `in` operator would be a substring/membership test, and on
        # None or a number it would raise.
        if not isinstance(entry, dict):
            continue
        headline = entry.get("headline")
        # Skip headlines that are empty or are not plain strings.
        if isinstance(headline, str) and headline:
            result.add(headline)
    return result
def get_headlines():
    """Fetch the page, parse it, and return the headlines found in it.

    Chains get_html() -> BeautifulSoup -> get_script_data() -> parse_data()
    and returns whatever parse_data() produces.
    """
    soup = BeautifulSoup(get_html(), 'html.parser')
    return parse_data(get_script_data(soup))
def main():
    """Print how many headlines were found, then one headline per line."""
    headlines = get_headlines()
    print(f"Found {len(headlines)} headlines")
    # get_headlines() returns a set; sort so the output order is stable
    # from one run to the next.
    print("\n".join(sorted(headlines)))


if __name__ == "__main__":
    # Scrape only when run as a script, not as a side effect of importing.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment