# Levi-Lesches/nytimes_headlines.py

## nytimes_headlines.py
import requests
from bs4 import BeautifulSoup
import json

# Page that is downloaded and searched for headlines.
URL = "https://www.nytimes.com/"
# Prefix of the inline <script> text that carries the page's preloaded JSON;
# the JSON payload begins immediately after this prefix.
START_OF_JSON = "window.__preloadedData = "

def get_html(timeout=10):
  """Download the page at URL and return its HTML.

  Args:
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, requests waits indefinitely on an unresponsive server.

  Returns:
    The response body as text.

  Raises:
    requests.RequestException: If the request fails or times out, or the
      server answers with an HTTP error status.
  """
  response = requests.get(URL, timeout=timeout)
  # Fail here on a 4xx/5xx answer instead of handing an error page to the parser.
  response.raise_for_status()
  return response.text

def get_script(soup):
  """Find the <script> whose text starts with START_OF_JSON.

  Args:
    soup: Parsed page; must provide find_all("script").

  Returns:
    The whitespace-stripped text of the first matching script, or None
    when no script matches.
  """
  for tag in soup.find_all("script"):
    # str() keeps strip() safe when tag.string is not a str
    # (presumably None for some tags — confirm against Beautiful Soup docs).
    text = str(tag.string).strip()
    if text.startswith(START_OF_JSON):
      return text
  return None

def get_script_data(soup):
  """Extract and decode the JSON that follows START_OF_JSON in the page.

  Args:
    soup: Parsed page, passed through to get_script.

  Returns:
    The decoded JSON value.

  Raises:
    ValueError: If no script starting with START_OF_JSON is found, or if
      its payload is not valid JSON (json.JSONDecodeError is a ValueError
      subclass).
  """
  script = get_script(soup)
  if script is None:
    raise ValueError(f"No script starting with {START_OF_JSON!r} was found")
  payload = script[len(START_OF_JSON):]
  # The assignment normally ends with a semicolon; drop it only when present
  # so a payload without one keeps its final character.
  if payload.endswith(";"):
    payload = payload[:-1]
  return json.loads(payload)

def parse_data(data):
  """Collect the distinct headline strings from the preloaded page data.

  Args:
    data: Decoded JSON; must contain an "initialState" mapping.

  Returns:
    A set of the non-empty string values stored under a "headline" key in
    the entries of data["initialState"].

  Raises:
    KeyError: If data has no "initialState" key.
  """
  headlines = set()
  for entry in data["initialState"].values():
    # Only mappings can carry a "headline" key. Strings, lists, numbers and
    # None are skipped instead of being probed with `in` / indexed by a string.
    if not isinstance(entry, dict):
      continue
    headline = entry.get("headline")
    # Skip headlines that are missing, empty, or not plain strings.
    if isinstance(headline, str) and headline:
      headlines.add(headline)
  return headlines

def get_headlines():
  """Download the page, parse it, and return the headlines found in it.

  Returns:
    The set produced by parse_data from the page's preloaded JSON.
  """
  soup = BeautifulSoup(get_html(), 'html.parser')
  return parse_data(get_script_data(soup))

def main():
  """Print how many headlines were found, then the headlines one per line."""
  found = get_headlines()
  summary = f"Found {len(found)} headlines"
  listing = "\n".join(found)
  print(summary)
  print(listing)


# Run only when executed as a script, so importing this module does not
# trigger a network request and printing.
if __name__ == "__main__":
  main()
	import requests
	from bs4 import BeautifulSoup
	import json

	# Page that is downloaded and searched for headlines.
	URL = "https://www.nytimes.com/"
	# Prefix of the inline <script> text that carries the page's preloaded JSON;
	# the JSON payload begins immediately after this prefix.
	START_OF_JSON = "window.__preloadedData = "

	def get_html(timeout=10):
		"""Download the page at URL and return its HTML.

		Args:
			timeout: Seconds to wait for the server before giving up. Without a
				timeout, requests waits indefinitely on an unresponsive server.

		Returns:
			The response body as text.

		Raises:
			requests.RequestException: If the request fails or times out, or the
				server answers with an HTTP error status.
		"""
		response = requests.get(URL, timeout=timeout)
		# Fail here on a 4xx/5xx answer instead of handing an error page to the parser.
		response.raise_for_status()
		return response.text

	def get_script(soup):
		"""Find the <script> whose text starts with START_OF_JSON.

		Args:
			soup: Parsed page; must provide find_all("script").

		Returns:
			The whitespace-stripped text of the first matching script, or None
			when no script matches.
		"""
		for tag in soup.find_all("script"):
			# str() keeps strip() safe when tag.string is not a str
			# (presumably None for some tags — confirm against Beautiful Soup docs).
			text = str(tag.string).strip()
			if text.startswith(START_OF_JSON):
				return text
		return None

	def get_script_data(soup):
		"""Extract and decode the JSON that follows START_OF_JSON in the page.

		Args:
			soup: Parsed page, passed through to get_script.

		Returns:
			The decoded JSON value.

		Raises:
			ValueError: If no script starting with START_OF_JSON is found, or if
				its payload is not valid JSON (json.JSONDecodeError is a ValueError
				subclass).
		"""
		script = get_script(soup)
		if script is None:
			raise ValueError(f"No script starting with {START_OF_JSON!r} was found")
		payload = script[len(START_OF_JSON):]
		# The assignment normally ends with a semicolon; drop it only when present
		# so a payload without one keeps its final character.
		if payload.endswith(";"):
			payload = payload[:-1]
		return json.loads(payload)

	def parse_data(data):
		"""Collect the distinct headline strings from the preloaded page data.

		Args:
			data: Decoded JSON; must contain an "initialState" mapping.

		Returns:
			A set of the non-empty string values stored under a "headline" key in
			the entries of data["initialState"].

		Raises:
			KeyError: If data has no "initialState" key.
		"""
		headlines = set()
		for entry in data["initialState"].values():
			# Only mappings can carry a "headline" key. Strings, lists, numbers and
			# None are skipped instead of being probed with `in` / indexed by a string.
			if not isinstance(entry, dict):
				continue
			headline = entry.get("headline")
			# Skip headlines that are missing, empty, or not plain strings.
			if isinstance(headline, str) and headline:
				headlines.add(headline)
		return headlines

	def get_headlines():
		"""Download the page, parse it, and return the headlines found in it.

		Returns:
			The set produced by parse_data from the page's preloaded JSON.
		"""
		soup = BeautifulSoup(get_html(), 'html.parser')
		return parse_data(get_script_data(soup))

	def main():
		"""Print how many headlines were found, then the headlines one per line."""
		headlines = get_headlines()
		print(f"Found {len(headlines)} headlines")
		print("\n".join(headlines))


	# Run only when executed as a script, so importing this module does not
	# trigger a network request and printing.
	if __name__ == "__main__":
		main()