Substack newsletter scraper that works
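The script below walks Substack's master sitemap (sitemap-tt.xml.gz) to build a list of newsletter URLs, then opens each newsletter's own sitemap, scrapes up to five recent posts through the scrape.do proxy API, and appends each post's title, subtitle, and newsletter name to substack_descriptions.csv, uploading that CSV to AWS S3 every ten newsletters.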
require 'nokogiri'
require 'open-uri'
require 'csv'
require 'cgi' # for escaping URLs passed to the scrape.do API
require 'aws-sdk-s3'

# Scrapes the Substack sitemap for all the newsletters available there,
# then collects recent post titles for each newsletter.
class SubstackScraper
  SITEMAP_URL = 'https://substack.com/sitemap-tt.xml.gz'.freeze
  AUTOSAVE_FREQUENCY = 10
  CSV_FILE_NAME = 'substack_descriptions.csv'.freeze

  def initialize
    # every newsletter URL found in the Substack sitemap
    @all_substacks = []
  end
  def call
    # reuse the cached newsletter list if it exists, otherwise rebuild it
    if File.exist?('substack_newsletters.csv')
      puts 'Loading substack_newsletters.csv'
      read_csv
    else
      puts 'Loading all substack newsletters'
      get_all_substacks
    end
    puts "Total number of newsletters found: #{@all_substacks.length}"

    # read all URLs that already have descriptions so they can be skipped
    begin
      existing_urls = CSV.read(CSV_FILE_NAME).map { |row| row[0] }
    rescue Errno::ENOENT
      existing_urls = [] # no output file yet, nothing to skip
    end

    # scrape each newsletter URL
    @all_substacks.each_with_index do |url, index|
      # skip URLs that were already scraped in a previous run
      if existing_urls.include?(url)
        puts "Skipping #{url}"
        next
      end
      # the description column is kept in the CSV but left blank for now
      description = ''
      # get post titles from the newsletter's own sitemap
      begin
        post_titles = get_all_post_titles(url)
        save_description_to_csv(url, description, post_titles)
        puts "Done scraping #{url}"
      rescue => e
        puts "Error getting post titles for #{url}: #{e.message}"
      end
      # autosave to S3 every AUTOSAVE_FREQUENCY newsletters
      if (index + 1) % AUTOSAVE_FREQUENCY == 0
        puts 'Saving to AWS'
        save_to_s3
        puts "Total number of newsletters scraped: #{index + 1} out of #{@all_substacks.length}"
      end
    end

    # final save
    puts 'Saving to AWS'
    save_to_s3
    puts "Total number of newsletters scraped: #{@all_substacks.length}"
    puts 'Done scraping all substack newsletters'
  end
  def save_to_s3
    # upload the CSV file to AWS S3
    s3 = Aws::S3::Client.new(
      region: 'us-east-1',
      access_key_id: ENV['AWS_ACCESS_KEY_ID'],
      secret_access_key: ENV['AWS_SECRET_ACCESS_KEY']
    )
    s3.put_object(
      bucket: ENV['AWS_BUCKET'],
      key: CSV_FILE_NAME,
      body: File.read(CSV_FILE_NAME) # read contents instead of leaking an open file handle
    )
    puts "Saved #{CSV_FILE_NAME} to AWS S3"
  end
  def get_all_post_titles(substack_url)
    # each newsletter publishes its own sitemap at /sitemap.xml
    sitemap_url = substack_url.chomp('/') + '/sitemap.xml'
    sitemap_xml = Nokogiri::XML(URI.open(sitemap_url))
    urls = []         # post URLs to scrape
    outs = []         # scraped titles and related info
    threads = []      # one thread per post
    mutex = Mutex.new # guards concurrent appends to outs
    puts "Opening #{sitemap_url}"

    # collect the first 5 post URLs, skipping the /about and /archive pages
    sitemap_xml.css('loc').each do |link|
      break if urls.length >= 5
      url = link.content
      next if url.include?('/about') || url.include?('/archive')
      urls << url
    end
    puts "Found #{urls} in #{substack_url}"

    # fetch every post in parallel through the scrape.do proxy API
    urls.each do |url|
      threads << Thread.new do
        scrape_uri = "http://api.scrape.do?token=#{ENV['SCRAPEDO_TOKEN']}&url=#{CGI.escape(url)}"
        html_out = Nokogiri::HTML(URI.open(scrape_uri))
        post_title = html_out.css('h1.post-title').text
        post_subtitle = html_out.css('h3.subtitle').text
        substack_name = html_out.css('h1.navbar-title').text
        mutex.synchronize do
          outs << {
            url: url,
            post_title: post_title,
            post_subtitle: post_subtitle,
            substack_name: substack_name
          }
        end
      end
    end
    # wait for all threads to finish
    threads.each(&:join)
    puts "Found #{outs.length} post titles for #{substack_url}"

    # pad with empty entries so every CSV row has exactly 5 post columns
    if outs.length < 5
      (5 - outs.length).times do
        outs << { url: '', post_title: '', post_subtitle: '', substack_name: '' }
      end
    end
    outs
  end
  def save_description_to_csv(substack_url, description, post_titles)
    # append one row: newsletter URL, description, then url/title/subtitle/name
    # for each of the 5 posts
    CSV.open(CSV_FILE_NAME, 'a+') do |csv|
      row = [substack_url, description]
      post_titles.first(5).each do |post|
        row.concat([post[:url], post[:post_title], post[:post_subtitle], post[:substack_name]])
      end
      csv << row
    end
  end
  def get_all_substacks
    # download the master sitemap and collect every newsletter URL from it
    puts "Opening #{SITEMAP_URL}"
    page = URI.open(SITEMAP_URL)
    puts "Done opening #{SITEMAP_URL}"
    xml = Nokogiri::XML(page)
    @all_substacks = xml.css('loc').map(&:text)
    # cache the list so later runs can skip this step
    save_to_csv
  end

  def save_to_csv
    # write all newsletter URLs as a single CSV row
    CSV.open('substack_newsletters.csv', 'wb') do |csv|
      csv << @all_substacks
    end
  end

  def read_csv
    # the cache file holds every newsletter URL in its first (and only) row
    @all_substacks = CSV.read('substack_newsletters.csv')[0]
  end
end
substack_scraper = SubstackScraper.new
substack_scraper.call
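To run the script, set the AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_BUCKET environment variables for the S3 upload, plus SCRAPEDO_TOKEN for the scrape.do API. As a minimal sketch, the post-title scraper can also be exercised against a single newsletter; the URL below is a hypothetical placeholder, not a real target:

# Minimal sketch: scrape one newsletter's recent post titles.
# https://example.substack.com is a placeholder; substitute a real newsletter URL.
scraper = SubstackScraper.new
scraper.get_all_post_titles('https://example.substack.com').each do |post|
  puts "#{post[:substack_name]} - #{post[:post_title]}"
end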