Created
December 7, 2019 15:22
-
-
Save lvngd/162e65a883c91d2e9feb1f842cca720e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
from requests_html import HTMLSession | |
class FashionArticleScraper:
    """Scrape article links from the NYT fashion section front page and
    write one (url, author, content) row per article to a CSV file.

    Note: the scrape runs immediately on construction (side effect kept
    for backward compatibility with existing callers).
    """

    def __init__(self, csv_file_path):
        """Store configuration and immediately run the scrape.

        :param csv_file_path: destination path for the output CSV file.
        """
        self.csv_file_path = csv_file_path
        self.base_url = 'https://www.nytimes.com/section/fashion'
        # Identify the bot explicitly; a default user agent may be blocked.
        self.headers = {'User-Agent': 'LVNGDBot 1.0'}
        self.process_fashion_articles()

    def process_fashion_articles(self):
        """starts scraper and writes article data to csv"""
        articles = self.get_fashion_article_links()
        # parse_article returns None when a page fails to load; drop those.
        article_data_rows = [
            row for row in map(self.parse_article, articles) if row
        ]
        self.write_articles_to_csv(article_data_rows)

    def filter_fashion_articles(self, links):
        """returns only articles and filters out slideshows and certain other articles"""
        fashion_links = []
        for link in links:
            # Keep only article pages (relative .html links), excluding
            # slideshows, off-site absolute URLs, and self-care pieces.
            if not link.endswith('html'):
                continue
            if (link.startswith('/slideshow/')
                    or link.startswith('http')
                    or '/self-care/' in link):
                continue
            # Deduplicate while preserving first-seen order.
            if link not in fashion_links:
                fashion_links.append(link)
        return fashion_links

    def get_fashion_article_links(self):
        """scrapes the first page of the fashion section and returns article links"""
        session = HTMLSession()
        r = session.get(self.base_url, headers=self.headers)
        links = r.html.links
        return self.filter_fashion_articles(links)

    def parse_article(self, link):
        """Fetch a single article page and extract its data.

        :param link: site-relative article path (e.g. '/2019/12/07/...html').
        :return: (link, author, content) tuple, or None if the request
            failed (a requests Response is falsy on 4xx/5xx statuses).
        """
        all_text = []
        session = HTMLSession()
        full_link = ''.join(['https://www.nytimes.com', link])
        r = session.get(full_link, headers=self.headers)
        row = None
        if r:
            # CSS class names here are tied to NYT's current markup and
            # may break if the site is redesigned.
            author_section = r.html.find('span.css-1baulvz')
            author = author_section[0].text if author_section else None
            article_texts = r.html.find('div.StoryBodyCompanionColumn')
            for text in article_texts:
                all_text.append(text.text)
            content = ' '.join(all_text).replace("\n", " ")
            row = (link, author, content)
        return row

    def write_articles_to_csv(self, rows):
        """Write a header row plus the given article rows to the CSV path.

        :param rows: iterable of (url, author, content) tuples.
        """
        # newline='' is required by the csv module (otherwise blank lines
        # appear on Windows); utf-8 handles non-ASCII article text.
        with open(self.csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(('url', 'author', 'content'))
            writer.writerows(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.