substack_scraper_nov9
require 'watir'
require 'nokogiri'
require 'open-uri'
require 'csv'
require 'net/http'
require 'aws-sdk-s3'

# Scrapes the Substack sitemap for every newsletter listed there and collects
# each newsletter's About-page description and its most recent post titles.
class SubstackScraper
  SITEMAP_URL = 'https://substack.com/sitemap-tt.xml.gz'.freeze
  AUTOSAVE_FREQUENCY = 10
  CSV_FILE_NAME = 'substack_descriptions.csv'.freeze

  def initialize
    Watir.default_timeout = 5
    # Open a headless Chrome browser instance
    @browser = Watir::Browser.new :chrome, headless: true
    # URLs of all newsletters found on Substack
    @all_substacks = []
  end

  def call
    # Load the cached newsletter list if it exists, otherwise fetch it from the sitemap
    if File.exist?('substack_newsletters.csv')
      puts 'Loading substack_newsletters.csv'
      read_csv
    else
      puts 'Loading all substack newsletters'
      get_all_substacks
    end
    puts "Total number of newsletters found: #{@all_substacks.length}"

    # Optionally clear the CSV file before a fresh run:
    # File.open(CSV_FILE_NAME, 'w') { |file| file.truncate(0) }

    # Read the URLs that already have descriptions so they can be skipped
    existing_urls = File.exist?(CSV_FILE_NAME) ? CSV.read(CSV_FILE_NAME).map { |row| row[0] } : []

    # For each URL, scrape the description and the first few post titles
    @all_substacks.each_with_index do |url, index|
      # Skip URLs that have already been scraped
      if existing_urls.include?(url)
        puts "Skipping #{url}"
        next
      end

      begin
        description = get_substack_description(url)
      rescue StandardError => e
        puts "Error getting description for #{url}: #{e.message}"
        # Record an empty description so the row can still be written
        description = ''
      end

      # Get the post h1 titles from the newsletter's own sitemap.xml
      post_titles = get_all_post_titles(url)
      save_description_to_csv(url, description, post_titles)
      puts "Done scraping #{url}"

      # Sleep for 4 seconds to avoid hammering the site
      sleep 4

      # Every AUTOSAVE_FREQUENCY iterations, upload the CSV to S3
      if (index + 1) % AUTOSAVE_FREQUENCY == 0
        puts 'Saving to AWS'
        save_to_s3
        puts "Total number of newsletters scraped: #{index + 1} out of #{@all_substacks.length}"
      end
    end
  end

  def save_to_s3
    # Upload the descriptions CSV to AWS S3
    s3 = Aws::S3::Client.new(region: 'us-east-1')
    s3.put_object(
      bucket: 'YOUR-BUCKET',
      key: CSV_FILE_NAME,
      body: File.open(CSV_FILE_NAME)
    )
    puts "Saved #{CSV_FILE_NAME} to AWS S3"
  end

  def get_all_post_titles(substack_url)
    # Fetch the newsletter's own sitemap and parse it
    sitemap_url = substack_url + 'sitemap.xml'
    sitemap_xml = Nokogiri::XML(URI.open(sitemap_url))

    # Visit each post URL (skipping /about and /archive) and collect its h1 title
    post_titles = []
    sitemap_xml.css('loc').each do |link|
      # Scrape only the first 5 posts
      break if post_titles.length >= 5

      begin
        url = link.content
        # Skip the about and archive pages
        next if url.include?('/about') || url.include?('/archive')

        @browser.goto(url)
        # Get the h1 title under the 'post-title' class
        title = @browser.h1(class: 'post-title').text
        puts url, title
        post_titles << title
        # Sleep for 4 seconds to avoid hammering the site
        sleep 4
      rescue StandardError => e
        puts "Error scraping #{url}; error message #{e.message}"
      end
    end
    puts "Found #{post_titles.length} post titles for #{substack_url}"
    puts post_titles
    post_titles
  end

  def get_substack_description(substack_url)
    # Open the newsletter's /about page
    @browser.goto substack_url + '/about'
    # Collect the text of every heading and paragraph inside the 'content-about' div
    div = @browser.div(class: 'content-about')
    description = ''
    div.elements.each do |element|
      if %w[h1 h2 h3 h4 p].include?(element.tag_name)
        # Append the element's text followed by a newline
        description += element.text + "\n"
      end
    end
    description
  end

  def save_description_to_csv(substack_url, description, post_titles)
    # Append the URL, the description, and up to five post titles as one CSV row
    CSV.open(CSV_FILE_NAME, 'a+') do |csv|
      csv << [substack_url, description, post_titles[0], post_titles[1], post_titles[2], post_titles[3], post_titles[4]]
    end
  end

  def get_all_substacks
    # Fetch the master sitemap and collect every newsletter URL from it
    puts "Opening #{SITEMAP_URL}"
    page = URI.open(SITEMAP_URL)
    puts "Done opening #{SITEMAP_URL}"
    xml = Nokogiri::XML(page)
    @all_substacks = xml.css('loc').map(&:text)
    # Cache the list so later runs can skip the sitemap fetch
    save_to_csv
  end

  def save_to_csv
    # Write all newsletter URLs as a single CSV row
    CSV.open('substack_newsletters.csv', 'wb') do |csv|
      csv << @all_substacks
    end
  end

  def read_csv
    # Read the cached newsletter URLs back (they are stored as a single CSV row)
    @all_substacks = CSV.read('substack_newsletters.csv')[0]
  end
end

substack_scraper = SubstackScraper.new
substack_scraper.call
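
Setup notes: the script expects Chrome plus a matching chromedriver on the PATH for Watir, and AWS credentials available through the SDK's default chain (environment variables, ~/.aws/credentials, or an instance profile), since Aws::S3::Client is created without explicit credentials; the 'YOUR-BUCKET' placeholder also needs a real bucket name. A minimal Gemfile sketch is shown below; the webdrivers gem is an assumption (one common way to fetch a matching chromedriver automatically, loaded with an extra require 'webdrivers' in the script), not something the original gist declares.

# Gemfile -- minimal sketch; versions unpinned, adjust as needed
source 'https://rubygems.org'

gem 'watir'       # drives the headless Chrome browser
gem 'webdrivers'  # assumption: auto-installs a matching chromedriver
gem 'nokogiri'    # parses the sitemap XML
gem 'aws-sdk-s3'  # uploads the CSV to S3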