Skip to content

Instantly share code, notes, and snippets.

@lvngd
Created December 7, 2019 15:22
Show Gist options
  • Save lvngd/162e65a883c91d2e9feb1f842cca720e to your computer and use it in GitHub Desktop.
import csv
from requests_html import HTMLSession
class FashionArticleScraper:
    """Scrape the first page of the NYT fashion section and save article
    data to a CSV file with columns (url, author, content).

    NOTE: the constructor immediately runs the full scrape (network I/O
    plus a file write) as a side effect of instantiation.
    """

    def __init__(self, csv_file_path):
        self.csv_file_path = csv_file_path
        self.base_url = 'https://www.nytimes.com/section/fashion'
        self.headers = {'User-Agent': 'LVNGDBot 1.0'}
        self.process_fashion_articles()

    def process_fashion_articles(self):
        """Start the scraper and write article data to csv."""
        article_data_rows = []
        for article in self.get_fashion_article_links():
            data_row = self.parse_article(article)
            # parse_article returns None for failed requests; skip those.
            if data_row:
                article_data_rows.append(data_row)
        self.write_articles_to_csv(article_data_rows)

    def filter_fashion_articles(self, links):
        """Return only article paths from *links*.

        Filters out slideshows, absolute (off-site) urls, and self-care
        articles; de-duplicates while preserving input order.
        """
        fashion_links = []
        for link in links:
            if not link.endswith('html'):
                continue
            if link.startswith(('/slideshow/', 'http')) or '/self-care/' in link:
                continue
            if link not in fashion_links:
                fashion_links.append(link)
        return fashion_links

    def get_fashion_article_links(self):
        """Scrape the first page of the fashion section; return article links."""
        session = HTMLSession()
        r = session.get(self.base_url, headers=self.headers)
        # r.html.links is every href on the page; keep only article paths.
        return self.filter_fashion_articles(r.html.links)

    def parse_article(self, link):
        """Fetch one article and return a (url, author, content) tuple.

        Returns None when the request fails (non-2xx response).
        """
        session = HTMLSession()
        full_link = ''.join(['https://www.nytimes.com', link])
        r = session.get(full_link, headers=self.headers)
        row = None
        if r:  # Response truthiness is False on 4xx/5xx status codes.
            author_section = r.html.find('span.css-1baulvz')
            author = author_section[0].text if author_section else None
            body_sections = r.html.find('div.StoryBodyCompanionColumn')
            content = ' '.join(text.text for text in body_sections)
            content = content.replace("\n", " ")
            row = (link, author, content)
        return row

    def write_articles_to_csv(self, rows):
        """Write a header row plus the article *rows* to the csv file.

        newline='' is required by the csv module to avoid blank lines
        between rows on Windows; utf-8 avoids UnicodeEncodeError on
        article text under locale-dependent default encodings.
        """
        with open(self.csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(('url', 'author', 'content'))
            writer.writerows(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment