@haljpeg
Created November 15, 2022 05:11
Substack newsletter scraper that works
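The script below walks the global Substack sitemap, caches the list of newsletters in substack_newsletters.csv, scrapes up to five recent post titles per newsletter through the scrape.do proxy, and appends one row per newsletter to substack_descriptions.csv, which it periodically uploads to S3. Each output row has roughly this shape (a sketch derived from save_description_to_csv below; the script itself writes no header row):

# substack_descriptions.csv row layout (sketch, no header row is written)
# newsletter_url, description,
#   post1_url, post1_title, post1_subtitle, post1_substack_name,
#   ... repeated through post 5 (blank fields pad newsletters with fewer than 5 posts)
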
require 'nokogiri'
require 'open-uri'
require 'csv'
require 'cgi' # used to URL-encode post urls passed to the scrape.do proxy
require 'aws-sdk-s3'
# Scraper class: collects all newsletters listed on the Substack sitemap
class SubstackScraper
  SITEMAP_URL = 'https://substack.com/sitemap-tt.xml.gz'.freeze
  AUTOSAVE_FREQUENCY = 10
  CSV_FILE_NAME = 'substack_descriptions.csv'.freeze

  def initialize
    # urls of every newsletter found on Substack
    @all_substacks = []
  end

  def call
    # load the cached newsletter list if it exists, otherwise fetch it from the sitemap
    if File.exist?('substack_newsletters.csv')
      puts 'Loading substack_newsletters.csv'
      read_csv
    else
      puts 'Loading all substack newsletters'
      get_all_substacks
    end
    puts "Total number of newsletters found: #{@all_substacks.length}"

    # read all urls that already have descriptions so they can be skipped
    begin
      existing_urls = CSV.read(CSV_FILE_NAME).map { |row| row[0] }
    rescue StandardError
      existing_urls = [] # no descriptions scraped yet
    end

    # scrape each newsletter's recent post titles
    @all_substacks.each_with_index do |url, index|
      # skip urls that already have descriptions
      if existing_urls.include?(url)
        puts "Skipping #{url}"
        next
      end

      # the newsletter description itself is not scraped here; only post titles are collected
      description = ''

      # get post titles from the newsletter's own sitemap.xml and save them
      begin
        post_titles = get_all_post_titles(url)
        save_description_to_csv(url, description, post_titles)
        puts "Done scraping #{url}"
      rescue StandardError => e
        puts "Error getting post titles for #{url} #{e.message}"
      end

      # upload a checkpoint to S3 every AUTOSAVE_FREQUENCY newsletters
      if (index + 1) % AUTOSAVE_FREQUENCY == 0
        puts 'Saving to AWS'
        save_to_s3
        puts "Total number of newsletters scraped: #{index + 1} out of #{@all_substacks.length}"
      end
    end

    # final upload once every newsletter has been processed
    puts 'Saving to AWS'
    save_to_s3
    puts "Total number of newsletters scraped: #{@all_substacks.length}"
    puts 'Done scraping all substack newsletters'
  end

  def save_to_s3
    # upload the descriptions CSV to S3 using credentials from the environment
    s3 = Aws::S3::Client.new(
      region: 'us-east-1',
      access_key_id: ENV['AWS_ACCESS_KEY_ID'],
      secret_access_key: ENV['AWS_SECRET_ACCESS_KEY']
    )
    s3.put_object(
      bucket: ENV['AWS_BUCKET'],
      key: CSV_FILE_NAME,
      body: File.read(CSV_FILE_NAME) # read the file contents so no handle is left open
    )
    puts "Saved #{CSV_FILE_NAME} to AWS S3"
  end

  def get_all_post_titles(substack_url)
    # each newsletter publishes its own sitemap listing its posts
    sitemap_url = substack_url + '/sitemap.xml'
    sitemap_xml = Nokogiri::XML(URI.open(sitemap_url))

    urls = []    # post urls in this newsletter to scrape
    outs = []    # post titles and other info collected from those urls
    threads = [] # one thread per post url
    puts "Opening #{sitemap_url}"

    # collect up to 5 post urls, skipping the /about and /archive pages
    sitemap_xml.css('loc').each do |link|
      break if urls.length >= 5

      url = link.content
      next if url.include?('/about') || url.include?('/archive')

      urls << url
    end
    puts "Found #{urls.length} post urls in #{substack_url}"

    # fetch each post through the scrape.do proxy and pull out its titles
    urls.each do |url|
      threads << Thread.new do
        scrape_uri = 'http://api.scrape.do?token=' + ENV['SCRAPEDO_TOKEN'] + '&url=' + CGI.escape(url)
        html_out = Nokogiri::HTML(URI.open(scrape_uri))
        outs << {
          url: url,
          post_title: html_out.css('h1.post-title').text,
          post_subtitle: html_out.css('h3.subtitle').text,
          substack_name: html_out.css('h1.navbar-title').text
        }
      end
    end
    # wait for all threads to finish
    threads.each(&:join)
    puts "Found #{outs.length} post titles for #{substack_url}"

    # pad to exactly 5 entries so every CSV row has the same number of columns
    if outs.length < 5
      (5 - outs.length).times do
        outs << { url: '', post_title: '', post_subtitle: '', substack_name: '' }
      end
    end
    outs
  end

  def save_description_to_csv(substack_url, description, post_titles)
    # append one row: newsletter url, description, then 4 columns for each of the 5 posts
    CSV.open(CSV_FILE_NAME, 'a+') do |csv|
      csv << [substack_url, description] +
             post_titles.flat_map { |post| post.values_at(:url, :post_title, :post_subtitle, :substack_name) }
    end
  end

  def get_all_substacks
    # fetch the global Substack sitemap and collect every newsletter url from it
    puts "Opening #{SITEMAP_URL}"
    page = URI.open(SITEMAP_URL)
    puts "Done opening #{SITEMAP_URL}"
    xml = Nokogiri::XML(page)
    @all_substacks = xml.css('loc').map(&:text)
    # cache the list locally so later runs can skip this step
    save_to_csv
  end

  def save_to_csv
    # write all newsletter urls as a single CSV row
    CSV.open('substack_newsletters.csv', 'wb') do |csv|
      csv << @all_substacks
    end
  end

  def read_csv
    # the cached list is a single row of urls, so read the first row back
    @all_substacks = CSV.read('substack_newsletters.csv')[0]
  end
end
substack_scraper = SubstackScraper.new
substack_scraper.call
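
To run the script, set the environment variables it reads (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_BUCKET, SCRAPEDO_TOKEN) and install the gems it requires. A minimal Gemfile sketch for that setup might look like this (the Gemfile is not part of the gist and is only an assumption about how the dependencies are installed):

# Gemfile (assumed setup, not included in the gist)
source 'https://rubygems.org'

gem 'nokogiri'
gem 'aws-sdk-s3'
gem 'csv'

After bundle install, the scraper can be started with ruby substack_scraper.rb (any file name works); progress messages and the periodic S3 autosave checkpoints are printed to stdout.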