Created
December 7, 2019 15:22
-
-
Save lvngd/162e65a883c91d2e9feb1f842cca720e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
from requests_html import HTMLSession | |
class FashionArticleScraper:
    """Scrape article links from the NYT fashion section front page and
    write one (url, author, content) row per article to a CSV file.

    Note: the scrape runs immediately on construction (side effect kept
    for backward compatibility with existing callers).
    """

    def __init__(self, csv_file_path):
        """Store configuration and immediately run the scrape.

        :param csv_file_path: destination path for the output CSV file.
        """
        self.csv_file_path = csv_file_path
        self.base_url = 'https://www.nytimes.com/section/fashion'
        # Identify the bot explicitly; a default user agent may be blocked.
        self.headers = {'User-Agent': 'LVNGDBot 1.0'}
        self.process_fashion_articles()

    def process_fashion_articles(self):
        """starts scraper and writes article data to csv"""
        articles = self.get_fashion_article_links()
        # parse_article returns None when a page fails to load; drop those.
        article_data_rows = [
            row for row in map(self.parse_article, articles) if row
        ]
        self.write_articles_to_csv(article_data_rows)

    def filter_fashion_articles(self, links):
        """returns only articles and filters out slideshows and certain other articles"""
        fashion_links = []
        for link in links:
            # Keep only article pages (relative .html links), excluding
            # slideshows, off-site absolute URLs, and self-care pieces.
            if not link.endswith('html'):
                continue
            if (link.startswith('/slideshow/')
                    or link.startswith('http')
                    or '/self-care/' in link):
                continue
            # Deduplicate while preserving first-seen order.
            if link not in fashion_links:
                fashion_links.append(link)
        return fashion_links

    def get_fashion_article_links(self):
        """scrapes the first page of the fashion section and returns article links"""
        session = HTMLSession()
        r = session.get(self.base_url, headers=self.headers)
        links = r.html.links
        return self.filter_fashion_articles(links)

    def parse_article(self, link):
        """Fetch a single article page and extract its data.

        :param link: site-relative article path (e.g. '/2019/12/07/...html').
        :return: (link, author, content) tuple, or None if the request
            failed (a requests Response is falsy on 4xx/5xx statuses).
        """
        all_text = []
        session = HTMLSession()
        full_link = ''.join(['https://www.nytimes.com', link])
        r = session.get(full_link, headers=self.headers)
        row = None
        if r:
            # CSS class names here are tied to NYT's current markup and
            # may break if the site is redesigned.
            author_section = r.html.find('span.css-1baulvz')
            author = author_section[0].text if author_section else None
            article_texts = r.html.find('div.StoryBodyCompanionColumn')
            for text in article_texts:
                all_text.append(text.text)
            content = ' '.join(all_text).replace("\n", " ")
            row = (link, author, content)
        return row

    def write_articles_to_csv(self, rows):
        """Write a header row plus the given article rows to the CSV path.

        :param rows: iterable of (url, author, content) tuples.
        """
        # newline='' is required by the csv module (otherwise blank lines
        # appear on Windows); utf-8 handles non-ASCII article text.
        with open(self.csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(('url', 'author', 'content'))
            writer.writerows(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.