@ebarns
Created November 13, 2019 02:59
from bs4 import BeautifulSoup
import requests


def get_section_links(soup):
    sections = []
    for link in soup.find_all('a')[:20]:
        print(link.text, link.get('href'))
        # The home page has a lot of "/section/" paths, which seems to be
        # where most of the article listings live.
        if "/section/" in link.get('href', ""):
            sections.append(link.get('href'))
    return sections


def add_article_hrefs(articles, soup):
    for link in soup.find_all('a')[:20]:
        href = link.get('href', "")
        # Article paths contain the year and end in .html.
        if "/2019/" in href and ".html" in href:
            articles.add(href)


def main():
    r = requests.get("https://www.nytimes.com")
    soup = BeautifulSoup(r.text, "html.parser")
    articles = set()
    for section_url in get_section_links(soup):
        # Section hrefs may be relative paths, so prepend the domain when needed.
        if not section_url.startswith("http"):
            section_url = f"https://www.nytimes.com{section_url}"
        soup = BeautifulSoup(requests.get(section_url).text, "html.parser")
        add_article_hrefs(articles, soup)
    article_urls = [f"https://www.nytimes.com{article_url}" for article_url in articles]
    bodies = []
    for article in article_urls:
        try:
            r = requests.get(article)
            soup = BeautifulSoup(r.text, "html.parser")
            # Article body text lives in <section> tags that carry a "name" attribute.
            article_content = "".join(s.text for s in soup.find_all("section") if "name" in s.attrs)
            print("content: ", article_content)
            bodies.append(article_content)
        except Exception as e:
            print(e)
            continue
    print(article_urls)
    print(bodies)


if __name__ == "__main__":
    main()
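

# Possible extension: the script above only prints the scraped bodies, so a
# rerun repeats the whole crawl. Below is a minimal sketch of one way to
# persist the results; the save_bodies helper and the nyt_bodies.json
# filename are hypothetical, e.g. call save_bodies(bodies) at the end of main().
import json


def save_bodies(bodies, path="nyt_bodies.json"):
    # Dump the list of article texts to disk as pretty-printed JSON
    # so they can be reloaded for later analysis.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(bodies, f, ensure_ascii=False, indent=2)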