# substack_scraper_nov9
require 'watir'
require 'nokogiri'
require 'open-uri'
require 'csv'
require 'net/http'
require 'aws-sdk-s3'
require 'zlib'     # added: decompress the gzipped sitemap when open-uri does not
require 'stringio' # added: wrap the raw bytes for Zlib::GzipReader
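
# Setup note (assumption, not in the original gist): this needs the watir,
# nokogiri, and aws-sdk-s3 gems installed, plus a Chrome/chromedriver pair
# on the PATH that Watir can drive headlessly.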

# Scrapes substack.com for all newsletters listed in its sitemap, collecting
# each newsletter's description and its most recent post titles.
class SubstackScraper
  SITEMAP_URL = 'https://substack.com/sitemap-tt.xml.gz'.freeze
  AUTOSAVE_FREQUENCY = 10
  CSV_FILE_NAME = 'substack_descriptions.csv'.freeze

  def initialize
    Watir.default_timeout = 5
    # open a headless Chrome browser
    @browser = Watir::Browser.new :chrome, headless: true
    # URLs of all newsletters found in the sitemap
    @all_substacks = []
  end

  def call
    # load the cached newsletter list if present, otherwise fetch it
    if File.exist?('substack_newsletters.csv')
      puts 'Loading substack_newsletters.csv'
      read_csv
    else
      puts 'Loading all substack newsletters'
      get_all_substacks
    end
    puts "Total number of newsletters found: #{@all_substacks.length}"
    # uncomment to clear the output file and start fresh
    # File.open(CSV_FILE_NAME, 'w') { |file| file.truncate(0) }
    # read all URLs that already have descriptions so they can be skipped
    # (guard against the file not existing yet on a first run)
    existing_urls = File.exist?(CSV_FILE_NAME) ? CSV.read(CSV_FILE_NAME).map { |row| row[0] } : []
    # scrape the description and recent post titles for each newsletter
    @all_substacks.each_with_index do |url, index|
      if existing_urls.include?(url)
        puts "Skipping #{url}"
        next
      end
      begin
        description = get_substack_description(url)
      rescue => e
        puts "Error getting description for #{url}: #{e.message}"
        description = '' # still write the row, just with an empty description
      end
      # grab up to five post h1 titles from the newsletter's sitemap.xml
      post_titles = get_all_post_titles(url)
      save_description_to_csv(url, description, post_titles)
      puts "Done scraping #{url}"
      # sleep for 4 seconds between newsletters
      sleep 4
      # every AUTOSAVE_FREQUENCY newsletters, upload progress to S3
      if (index + 1) % AUTOSAVE_FREQUENCY == 0
        puts 'Saving to AWS'
        save_to_s3
        puts "Total number of newsletters scraped: #{index + 1} out of #{@all_substacks.length}"
      end
    end
  ensure
    # always close the headless browser, even if scraping aborts
    @browser.close
  end
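
  # Uploads the output CSV to S3. Assumptions not spelled out in the
  # original: credentials come from the AWS default credential chain
  # (env vars, ~/.aws/credentials, or an instance profile), and
  # 'YOUR-BUCKET' is a placeholder for a bucket you own.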
  def save_to_s3
    s3 = Aws::S3::Client.new(region: 'us-east-1')
    # File.read avoids leaking an open file handle
    s3.put_object(
      bucket: 'YOUR-BUCKET',
      key: CSV_FILE_NAME,
      body: File.read(CSV_FILE_NAME)
    )
    puts "Saved #{CSV_FILE_NAME} to AWS S3"
  end
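
  # Collects up to five post titles for one newsletter by walking its own
  # /sitemap.xml and reading the h1.post-title heading from each post page.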
  def get_all_post_titles(substack_url)
    # normalize the URL so it works with or without a trailing slash
    sitemap_url = substack_url.chomp('/') + '/sitemap.xml'
    sitemap_xml = Nokogiri::XML(URI.open(sitemap_url))
    post_titles = []
    sitemap_xml.css('loc').each do |link|
      # scrape the first 5 posts only
      break if post_titles.length >= 5
      begin
        url = link.content
        # skip the /about and /archive pages
        next if url.include?('/about') || url.include?('/archive')
        @browser.goto(url)
        # the post title is the h1 with class 'post-title'
        title = @browser.h1(class: 'post-title').text
        puts url, title
        post_titles << title
        # sleep for 4 seconds between posts
        sleep 4
      rescue => e
        puts "Error scraping #{url}; error message #{e.message}"
      end
    end
    puts "Found #{post_titles.length} post titles for #{substack_url}"
    puts post_titles
    post_titles
  end
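
  # Builds the newsletter description from the headings and paragraphs
  # inside the div.content-about element on the /about page.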
  def get_substack_description(substack_url)
    @browser.goto substack_url.chomp('/') + '/about'
    div = @browser.div(class: 'content-about')
    # concatenate every heading and paragraph in the about section
    description = ''
    div.elements.each do |element|
      if %w[h1 h2 h3 h4 p].include?(element.tag_name)
        description += element.text + "\n" # double quotes so \n is a real newline
      end
    end
    description
  end

  def save_description_to_csv(substack_url, description, post_titles)
    CSV.open(CSV_FILE_NAME, 'a+') do |csv|
      # one row per newsletter: url, description, up to five post titles
      csv << [substack_url, description,
              post_titles[0], post_titles[1], post_titles[2],
              post_titles[3], post_titles[4]]
    end
  end
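
  # Fetches the master sitemap (gzipped XML) and extracts every newsletter
  # URL from its <loc> entries, then caches them locally.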
  def get_all_substacks
    puts "Opening #{SITEMAP_URL}"
    body = URI.open(SITEMAP_URL).read
    # open-uri only gunzips when the response sets Content-Encoding, so
    # check for the gzip magic bytes and decompress by hand if needed
    if body.getbyte(0) == 0x1f && body.getbyte(1) == 0x8b
      body = Zlib::GzipReader.new(StringIO.new(body)).read
    end
    puts "Done opening #{SITEMAP_URL}"
    # parse the XML and collect every <loc> URL
    xml = Nokogiri::XML(body)
    @all_substacks = xml.css('loc').map(&:text)
    # cache the list so later runs can skip this step
    save_to_csv
  end

  def save_to_csv
    # cache every newsletter URL as a single CSV row
    CSV.open('substack_newsletters.csv', 'wb') do |csv|
      csv << @all_substacks
    end
  end

  def read_csv
    # the cache is a single row, so row 0 holds all the URLs
    @all_substacks = CSV.read('substack_newsletters.csv')[0]
  end
end

substack_scraper = SubstackScraper.new
substack_scraper.call
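
# Usage sketch (assumptions: the file is saved as substack_scraper.rb and
# AWS credentials plus a real bucket name are configured):
#   ruby substack_scraper.rb
# Results accumulate in substack_descriptions.csv and are uploaded to S3
# every AUTOSAVE_FREQUENCY newsletters.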