Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrapes headlines from the front page of nytimes.com
import requests
from bs4 import BeautifulSoup
import json
# Page that get_html() downloads.
URL = "https://www.nytimes.com/"
# Prefix of the inline <script> whose remainder is the JSON payload;
# get_script() matches on it and get_script_data() strips it off.
START_OF_JSON = "window.__preloadedData = "
def get_html(timeout=10):
    """Download the page at ``URL`` and return its body as text.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled host.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or an HTTP error status (4xx/5xx).
    """
    response = requests.get(URL, timeout=timeout)
    # Fail here on an error status instead of handing an error page to the
    # parser, where the failure would surface as an unrelated exception.
    response.raise_for_status()
    return response.text
def get_script(soup):
    """Return the text of the first ``<script>`` that starts with ``START_OF_JSON``.

    Each tag's ``.string`` is converted with ``str()`` and stripped before the
    prefix check, so a tag without a string becomes the text ``"None"`` and
    never matches.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        The stripped script text, or ``None`` when no script matches.
    """
    stripped_texts = (str(tag.string).strip() for tag in soup.find_all("script"))
    return next(
        (text for text in stripped_texts if text.startswith(START_OF_JSON)),
        None,
    )
def get_script_data(soup):
    """Locate the preloaded-data script in *soup* and decode its JSON payload.

    Args:
        soup: Parsed document passed through to ``get_script``.

    Returns:
        The object produced by ``json.loads`` on the text that follows the
        ``START_OF_JSON`` prefix.

    Raises:
        ValueError: If no matching script is present, or if the payload is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    script = get_script(soup)
    if script is None:
        # Report the real problem rather than failing on ``None[...]``.
        raise ValueError(
            f"no <script> starting with {START_OF_JSON!r} was found"
        )
    payload = script[len(START_OF_JSON):].rstrip()
    # The assignment statement normally ends with a semicolon; drop it only
    # when it is actually there so a bare JSON value is not truncated.
    if payload.endswith(";"):
        payload = payload[:-1]
    return json.loads(payload)
def parse_data(data):
    """Collect the distinct, non-empty string headlines from preloaded data.

    Args:
        data: Mapping with an ``"initialState"`` entry whose values are
            inspected for a ``"headline"`` key.

    Returns:
        A set of headline strings. Entries that are not dicts, that lack a
        ``"headline"`` key, or whose headline is empty or not a ``str`` are
        ignored.

    Raises:
        KeyError: If *data* has no ``"initialState"`` key.
    """
    state = data["initialState"]
    headlines = set()
    for entry in state.values():
        # Only dict entries can carry a "headline" key. On a str or list the
        # ``in`` operator is a substring/membership test and the subsequent
        # lookup would raise TypeError.
        if not isinstance(entry, dict):
            continue
        headline = entry.get("headline")
        if isinstance(headline, str) and headline:
            headlines.add(headline)
    return headlines
def get_headlines():
    """Fetch the page, parse it, and return the headlines found in its preloaded data.

    Returns:
        Whatever ``parse_data`` produces for the JSON extracted from the page.
    """
    soup = BeautifulSoup(get_html(), 'html.parser')
    return parse_data(get_script_data(soup))
def main():
    """Print how many headlines were found, then one headline per line."""
    headlines = get_headlines()
    print(f"Found {len(headlines)} headlines")
    # Sort before printing: iterating the collection directly yields an
    # arbitrary order that can differ from one run to the next.
    print("\n".join(sorted(headlines)))
# Run the scraper only when executed as a script, so importing this module
# does not trigger a network request or print anything.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment