# substack_scraper_nov9
require 'watir'
require 'nokogiri'
require 'open-uri'
require 'csv'
require 'net/http'
require 'aws-sdk-s3'
require 'zlib'     # added: decompress the gzipped sitemap when open-uri does not
require 'stringio' # added: wrap the raw bytes for Zlib::GzipReader
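
# Setup note (assumption, not in the original gist): this needs the watir,
# nokogiri, and aws-sdk-s3 gems installed, plus a Chrome/chromedriver pair
# on the PATH that Watir can drive headlessly.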

# Scrapes substack.com for all newsletters listed in its sitemap, collecting
# each newsletter's description and its most recent post titles.
class SubstackScraper
  SITEMAP_URL = 'https://substack.com/sitemap-tt.xml.gz'.freeze
  AUTOSAVE_FREQUENCY = 10
  CSV_FILE_NAME = 'substack_descriptions.csv'.freeze

  def initialize
    Watir.default_timeout = 5
    # open a headless Chrome browser
    @browser = Watir::Browser.new :chrome, headless: true
    # URLs of all newsletters found in the sitemap
    @all_substacks = []
  end

  def call
    # load the cached newsletter list if present, otherwise fetch it
    if File.exist?('substack_newsletters.csv')
      puts 'Loading substack_newsletters.csv'
      read_csv
    else
      puts 'Loading all substack newsletters'
      get_all_substacks
    end
    puts "Total number of newsletters found: #{@all_substacks.length}"
    # uncomment to clear the output file and start fresh
    # File.open(CSV_FILE_NAME, 'w') { |file| file.truncate(0) }
    # read all URLs that already have descriptions so they can be skipped
    # (guard against the file not existing yet on a first run)
    existing_urls = File.exist?(CSV_FILE_NAME) ? CSV.read(CSV_FILE_NAME).map { |row| row[0] } : []
    # scrape the description and recent post titles for each newsletter
    @all_substacks.each_with_index do |url, index|
      if existing_urls.include?(url)
        puts "Skipping #{url}"
        next
      end
      begin
        description = get_substack_description(url)
      rescue => e
        puts "Error getting description for #{url}: #{e.message}"
        description = '' # still write the row, just with an empty description
      end
      # grab up to five post h1 titles from the newsletter's sitemap.xml
      post_titles = get_all_post_titles(url)
      save_description_to_csv(url, description, post_titles)
      puts "Done scraping #{url}"
      # sleep for 4 seconds between newsletters
      sleep 4
      # every AUTOSAVE_FREQUENCY newsletters, upload progress to S3
      if (index + 1) % AUTOSAVE_FREQUENCY == 0
        puts 'Saving to AWS'
        save_to_s3
        puts "Total number of newsletters scraped: #{index + 1} out of #{@all_substacks.length}"
      end
    end
  ensure
    # always close the headless browser, even if scraping aborts
    @browser.close
  end
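
  # Uploads the output CSV to S3. Assumptions not spelled out in the
  # original: credentials come from the AWS default credential chain
  # (env vars, ~/.aws/credentials, or an instance profile), and
  # 'YOUR-BUCKET' is a placeholder for a bucket you own.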
  def save_to_s3
    s3 = Aws::S3::Client.new(region: 'us-east-1')
    # File.read avoids leaking an open file handle
    s3.put_object(
      bucket: 'YOUR-BUCKET',
      key: CSV_FILE_NAME,
      body: File.read(CSV_FILE_NAME)
    )
    puts "Saved #{CSV_FILE_NAME} to AWS S3"
  end
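
  # Collects up to five post titles for one newsletter by walking its own
  # /sitemap.xml and reading the h1.post-title heading from each post page.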
  def get_all_post_titles(substack_url)
    # normalize the URL so it works with or without a trailing slash
    sitemap_url = substack_url.chomp('/') + '/sitemap.xml'
    sitemap_xml = Nokogiri::XML(URI.open(sitemap_url))
    post_titles = []
    sitemap_xml.css('loc').each do |link|
      # scrape the first 5 posts only
      break if post_titles.length >= 5
      begin
        url = link.content
        # skip the /about and /archive pages
        next if url.include?('/about') || url.include?('/archive')
        @browser.goto(url)
        # the post title is the h1 with class 'post-title'
        title = @browser.h1(class: 'post-title').text
        puts url, title
        post_titles << title
        # sleep for 4 seconds between posts
        sleep 4
      rescue => e
        puts "Error scraping #{url}; error message #{e.message}"
      end
    end
    puts "Found #{post_titles.length} post titles for #{substack_url}"
    puts post_titles
    post_titles
  end
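
  # Builds the newsletter description from the headings and paragraphs
  # inside the div.content-about element on the /about page.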
  def get_substack_description(substack_url)
    @browser.goto substack_url.chomp('/') + '/about'
    div = @browser.div(class: 'content-about')
    # concatenate every heading and paragraph in the about section
    description = ''
    div.elements.each do |element|
      if %w[h1 h2 h3 h4 p].include?(element.tag_name)
        description += element.text + "\n" # double quotes so \n is a real newline
      end
    end
    description
  end

  def save_description_to_csv(substack_url, description, post_titles)
    CSV.open(CSV_FILE_NAME, 'a+') do |csv|
      # one row per newsletter: url, description, up to five post titles
      csv << [substack_url, description,
              post_titles[0], post_titles[1], post_titles[2],
              post_titles[3], post_titles[4]]
    end
  end
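
  # Fetches the master sitemap (gzipped XML) and extracts every newsletter
  # URL from its <loc> entries, then caches them locally.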
  def get_all_substacks
    puts "Opening #{SITEMAP_URL}"
    body = URI.open(SITEMAP_URL).read
    # open-uri only gunzips when the response sets Content-Encoding, so
    # check for the gzip magic bytes and decompress by hand if needed
    if body.getbyte(0) == 0x1f && body.getbyte(1) == 0x8b
      body = Zlib::GzipReader.new(StringIO.new(body)).read
    end
    puts "Done opening #{SITEMAP_URL}"
    # parse the XML and collect every <loc> URL
    xml = Nokogiri::XML(body)
    @all_substacks = xml.css('loc').map(&:text)
    # cache the list so later runs can skip this step
    save_to_csv
  end

  def save_to_csv
    # cache every newsletter URL as a single CSV row
    CSV.open('substack_newsletters.csv', 'wb') do |csv|
      csv << @all_substacks
    end
  end

  def read_csv
    # the cache is a single row, so row 0 holds all the URLs
    @all_substacks = CSV.read('substack_newsletters.csv')[0]
  end
end

substack_scraper = SubstackScraper.new
substack_scraper.call
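
# Usage sketch (assumptions: the file is saved as substack_scraper.rb and
# AWS credentials plus a real bucket name are configured):
#   ruby substack_scraper.rb
# Results accumulate in substack_descriptions.csv and are uploaded to S3
# every AUTOSAVE_FREQUENCY newsletters.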