Skip to content

Instantly share code, notes, and snippets.

@epitron
Last active October 17, 2017 02:37
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save epitron/10011220 to your computer and use it in GitHub Desktop.
Save epitron/10011220 to your computer and use it in GitHub Desktop.
An Instapaper scraper.
#!/usr/bin/env ruby
require 'mechanize'
# Instapaper credentials used for the login form below. Fill these in before running.
USERNAME = ""
PASSWORD = ""
# TODO: Save cookies with "http.cookie_jar.{load,save} filename"
# TODO: Store password in ~/.config or some kind of wallet

# Returns +element+ unchanged, or exits with a readable error message when a
# page lookup came back nil (rather than crashing later with NoMethodError).
#
# @param element [Object, nil] result of a link/form lookup
# @param description [String] what was being looked for, used in the error text
# @return [Object] the non-nil element
def expect_element(element, description)
  abort "Error: couldn't find #{description} on the page" unless element
  element
end

http = Mechanize.new do |a|
  a.user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.149 Safari/537.36"
  # Certificate verification is intentionally left at the library default:
  # switching it off (VERIFY_NONE) would let anyone on the network path read
  # the password submitted below.
end

puts "Logging in..."
# Start on HTTPS so the session never touches plain HTTP.
page = http.get("https://www.instapaper.com/")
sign_in_link = page.links.find { |l| l.text["Sign In"] }
page = expect_element(sign_in_link, 'the "Sign In" link').click

login_form = expect_element(page.form_with(action: "/user/login"), "the login form")
login_form.username = USERNAME
login_form.password = PASSWORD
page = login_form.click_button

puts "Clicking /user"
# This link is only looked up after submitting the login form, so a missing
# link here most likely means the login was rejected.
page = expect_element(page.link_with(href: "/user"), 'the "/user" link (did the login fail?)').click

puts "Clicking /user/export"
page = expect_element(page.link_with(href: "/user/export"), 'the "/user/export" link').click

puts "Clicking /export/csv"
csv = expect_element(page.form_with(action: "/export/csv"), "the CSV export form").click_button

# One export file per day; the date in the name is today's date.
date = Time.now.strftime("%Y-%m-%d")
outfile = File.expand_path "~/backup/instapaper/instapaper-#{date}.csv"
puts "Saving to #{outfile.inspect}..."
bytes = csv.save! outfile
puts "Done! (#{bytes} bytes written)"

# Hand over to the companion merge script; report it if that step fails
# instead of ignoring the result.
merged_ok = system("~/backup/instapaper/merge")
warn "Warning: ~/backup/instapaper/merge did not complete successfully" unless merged_ok
#!/usr/bin/env ruby
require 'epitools'
# Collect the dated export files, sort them, and parse each one into a
# [path, rows] pair. The date is embedded in the file name, so the sorted
# order is presumably chronological -- confirm against Path's sort order.
export_paths = Path["~/backup/instapaper/instapaper-20*.csv"].sort
all_csvs = export_paths.map do |path|
  [path, CSV.read(path)]
end
# Locates +needle+ inside +haystack+, treating two rows as the same entry
# when their first two columns are equal; any further columns are ignored.
#
# @param needle [Array] the row to look for
# @param haystack [Array<Array>] the rows to search
# @return [Integer, nil] index of the first matching row, or nil if none match
def find_row(needle, haystack)
  haystack.each_with_index do |candidate, position|
    return position if candidate[0, 2] == needle[0, 2]
  end
  nil
end
puts
puts "* Merging #{all_csvs.size} CSVs..."
puts

# How many leading rows of each export are searched for in the data merged so
# far before giving up and treating the export as non-overlapping.
probe_limit = 10

# Accumulates the combined rows; the first export's header row goes in first.
merged = []

all_csvs.each do |path, csv|
  puts "* #{path}"

  # An empty file has no header row; merging it would put a nil row into
  # +merged+ and break later row comparisons.
  if csv.empty?
    puts " |_ Empty file, skipping"
    next
  end

  headers = csv.shift
  merged << headers if merged.empty?

  # Rows are appended in reversed file order (presumably the export lists the
  # newest entry first -- confirm against a real export).
  csv.reverse!

  # Look for the first probed row that is already present in +merged+.
  # Iterating a bounded slice (rather than pulling from an enumerator inside
  # `loop`) matters: an export with fewer than +probe_limit+ rows and no
  # overlap must still fall through to the append below.
  merge_pos = nil
  csv.first(probe_limit).each do |row|
    merge_pos = find_row(row, merged)
    break if merge_pos
  end

  if merge_pos
    puts " |_ Overlap at row #{merge_pos}"
    merged[merge_pos..-1] = [] # trim everything from the overlap onwards
  else
    puts " |_ No overlap"
  end

  # In both cases the whole export is appended after the (possibly trimmed) data.
  merged += csv
end

outfile = File.expand_path "~/backup/instapaper.csv"
puts
puts "* Done! Total rows: #{merged.size}"
puts "* Writing to #{outfile}..."
CSV.open(outfile, "w") do |out|
  merged.each { |row| out << row }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment