@haljpeg
Created November 15, 2022 05:11
Substack newsletter scraper that works
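The script below walks the global Substack sitemap, caches the list of newsletters in substack_newsletters.csv, scrapes up to five recent post titles per newsletter through the scrape.do proxy, and appends one row per newsletter to substack_descriptions.csv, which it periodically uploads to S3. Each output row has roughly this shape (a sketch derived from save_description_to_csv below; the script itself writes no header row):

# substack_descriptions.csv row layout (sketch, no header row is written)
# newsletter_url, description,
#   post1_url, post1_title, post1_subtitle, post1_substack_name,
#   ... repeated through post 5 (blank fields pad newsletters with fewer than 5 posts)
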
require 'nokogiri'
require 'open-uri'
require 'csv'
require 'cgi' # used to URL-encode post urls passed to the scrape.do proxy
require 'aws-sdk-s3'
# Scraper class: collects all newsletters listed on the Substack sitemap
class SubstackScraper
  SITEMAP_URL = 'https://substack.com/sitemap-tt.xml.gz'.freeze
  AUTOSAVE_FREQUENCY = 10
  CSV_FILE_NAME = 'substack_descriptions.csv'.freeze

  def initialize
    # urls of every newsletter found on Substack
    @all_substacks = []
  end

  def call
    # load the cached newsletter list if it exists, otherwise fetch it from the sitemap
    if File.exist?('substack_newsletters.csv')
      puts 'Loading substack_newsletters.csv'
      read_csv
    else
      puts 'Loading all substack newsletters'
      get_all_substacks
    end
    puts "Total number of newsletters found: #{@all_substacks.length}"

    # read all urls that already have descriptions so they can be skipped
    begin
      existing_urls = CSV.read(CSV_FILE_NAME).map { |row| row[0] }
    rescue StandardError
      existing_urls = [] # no descriptions scraped yet
    end

    # scrape each newsletter's recent post titles
    @all_substacks.each_with_index do |url, index|
      # skip urls that already have descriptions
      if existing_urls.include?(url)
        puts "Skipping #{url}"
        next
      end

      # the newsletter description itself is not scraped here; only post titles are collected
      description = ''

      # get post titles from the newsletter's own sitemap.xml and save them
      begin
        post_titles = get_all_post_titles(url)
        save_description_to_csv(url, description, post_titles)
        puts "Done scraping #{url}"
      rescue StandardError => e
        puts "Error getting post titles for #{url} #{e.message}"
      end

      # upload a checkpoint to S3 every AUTOSAVE_FREQUENCY newsletters
      if (index + 1) % AUTOSAVE_FREQUENCY == 0
        puts 'Saving to AWS'
        save_to_s3
        puts "Total number of newsletters scraped: #{index + 1} out of #{@all_substacks.length}"
      end
    end

    # final upload once every newsletter has been processed
    puts 'Saving to AWS'
    save_to_s3
    puts "Total number of newsletters scraped: #{@all_substacks.length}"
    puts 'Done scraping all substack newsletters'
  end

  def save_to_s3
    # upload the descriptions CSV to S3 using credentials from the environment
    s3 = Aws::S3::Client.new(
      region: 'us-east-1',
      access_key_id: ENV['AWS_ACCESS_KEY_ID'],
      secret_access_key: ENV['AWS_SECRET_ACCESS_KEY']
    )
    s3.put_object(
      bucket: ENV['AWS_BUCKET'],
      key: CSV_FILE_NAME,
      body: File.read(CSV_FILE_NAME) # read the file contents so no handle is left open
    )
    puts "Saved #{CSV_FILE_NAME} to AWS S3"
  end

  def get_all_post_titles(substack_url)
    # each newsletter publishes its own sitemap listing its posts
    sitemap_url = substack_url + '/sitemap.xml'
    sitemap_xml = Nokogiri::XML(URI.open(sitemap_url))

    urls = []    # post urls in this newsletter to scrape
    outs = []    # post titles and other info collected from those urls
    threads = [] # one thread per post url
    puts "Opening #{sitemap_url}"

    # collect up to 5 post urls, skipping the /about and /archive pages
    sitemap_xml.css('loc').each do |link|
      break if urls.length >= 5

      url = link.content
      next if url.include?('/about') || url.include?('/archive')

      urls << url
    end
    puts "Found #{urls.length} post urls in #{substack_url}"

    # fetch each post through the scrape.do proxy and pull out its titles
    urls.each do |url|
      threads << Thread.new do
        scrape_uri = 'http://api.scrape.do?token=' + ENV['SCRAPEDO_TOKEN'] + '&url=' + CGI.escape(url)
        html_out = Nokogiri::HTML(URI.open(scrape_uri))
        outs << {
          url: url,
          post_title: html_out.css('h1.post-title').text,
          post_subtitle: html_out.css('h3.subtitle').text,
          substack_name: html_out.css('h1.navbar-title').text
        }
      end
    end
    # wait for all threads to finish
    threads.each(&:join)
    puts "Found #{outs.length} post titles for #{substack_url}"

    # pad to exactly 5 entries so every CSV row has the same number of columns
    if outs.length < 5
      (5 - outs.length).times do
        outs << { url: '', post_title: '', post_subtitle: '', substack_name: '' }
      end
    end
    outs
  end

  def save_description_to_csv(substack_url, description, post_titles)
    # append one row: newsletter url, description, then 4 columns for each of the 5 posts
    CSV.open(CSV_FILE_NAME, 'a+') do |csv|
      csv << [substack_url, description] +
             post_titles.flat_map { |post| post.values_at(:url, :post_title, :post_subtitle, :substack_name) }
    end
  end

  def get_all_substacks
    # fetch the global Substack sitemap and collect every newsletter url from it
    puts "Opening #{SITEMAP_URL}"
    page = URI.open(SITEMAP_URL)
    puts "Done opening #{SITEMAP_URL}"
    xml = Nokogiri::XML(page)
    @all_substacks = xml.css('loc').map(&:text)
    # cache the list locally so later runs can skip this step
    save_to_csv
  end

  def save_to_csv
    # write all newsletter urls as a single CSV row
    CSV.open('substack_newsletters.csv', 'wb') do |csv|
      csv << @all_substacks
    end
  end

  def read_csv
    # the cached list is a single row of urls, so read the first row back
    @all_substacks = CSV.read('substack_newsletters.csv')[0]
  end
end
substack_scraper = SubstackScraper.new
substack_scraper.call
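
To run the script, set the environment variables it reads (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_BUCKET, SCRAPEDO_TOKEN) and install the gems it requires. A minimal Gemfile sketch for that setup might look like this (the Gemfile is not part of the gist and is only an assumption about how the dependencies are installed):

# Gemfile (assumed setup, not included in the gist)
source 'https://rubygems.org'

gem 'nokogiri'
gem 'aws-sdk-s3'
gem 'csv'

After bundle install, the scraper can be started with ruby substack_scraper.rb (any file name works); progress messages and the periodic S3 autosave checkpoints are printed to stdout.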