Skip to content

Instantly share code, notes, and snippets.

@akuhn
Last active December 4, 2018 08:00
Show Gist options
  • Save akuhn/1fb5c3a70cc876f40594932f3d28f8a9 to your computer and use it in GitHub Desktop.
Save akuhn/1fb5c3a70cc876f40594932f3d28f8a9 to your computer and use it in GitHub Desktop.
Download photos from a Tumblr blog.
require 'json'
require 'net/http'
require 'nokogiri'
require 'omakase' # see omakase repo on my github
require 'pry'
require 'sqlite3'
# --- Command-line handling and credentials ---------------------------------
#
# Usage: ruby tumblr.rb BLOG_NAME NUM_PHOTOS [-d|--debug]
#
# Arguments are read from the built-in ARGV constant. The `$ARGV` global is
# only an alias that exists once the `English` library has been loaded; when
# it is not loaded `$ARGV` is nil and the length check itself would raise.
#
# The arguments are validated before the key file is read, so a bare
# invocation prints the usage text instead of failing on a missing key file.
if ARGV.length < 2
  puts "Usage: ruby tumblr.rb BLOG_NAME NUM_PHOTOS"
  # A usage error is a failure: report it through a non-zero exit status.
  exit 1
end

# Tumblr API key, taken from the `key` entry of key_tumblr.json in the
# current working directory.
API_KEY = JSON.parse(File.read('key_tumblr.json'))['key']

# Globals consumed by the rest of the script.
$blog_name = ARGV[0]
$num_photos = ARGV[1].to_i
# Debug mode is enabled when the last argument is exactly -d or --debug.
# The pattern is anchored so that arguments merely containing "-d"
# (e.g. "--no-debug") do not switch it on by accident.
$debug = ARGV.last.match?(/\A(?:-d|--debug)\z/)
# Minimal SQLite helper used by this script; it wraps a single
# SQLite3::Database handle.
module Sequel
  # Database handle, opened on data_tumblr.sqlite when this file is loaded.
  SQL = SQLite3::Database.new('data_tumblr.sqlite')

  # Creates table +name+ unless it already exists. A `created_at` column
  # defaulting to the current timestamp is always added.
  #
  # @param name [String, Symbol] table name. It is interpolated into the SQL
  #   text, so it must come from trusted code, never from user input.
  # @param columns [Hash] column name => SQL column definition (also
  #   interpolated verbatim)
  # @return whatever SQLite3::Database#execute returns for the statement
  def self.create_table(name, columns)
    # Build a new hash rather than assigning into +columns+, so the caller's
    # hash is not modified as a side effect of creating the table.
    with_timestamp = columns.merge(created_at: 'DATETIME DEFAULT CURRENT_TIMESTAMP')
    # The block parameter is called +column+ so it does not shadow +name+.
    body = with_timestamp.map { |column, type| "`#{column}` #{type}" }.join(', ')
    SQL.execute("CREATE TABLE IF NOT EXISTS `#{name}` (#{body})")
  end

  # Runs +query+ with +values+ bound to its `?` placeholders.
  #
  # @param query [String] SQL text
  # @param values [Array] bind values, in placeholder order
  # @return whatever SQLite3::Database#execute returns
  def self.execute(query, *values)
    SQL.execute(query, values)
  end
end
# Tiny HTTP GET helper built on Net::HTTP.
module Web
  # Connection to the Tumblr API host, opened when this file is loaded and
  # reused for every relative-path request.
  # NOTE(review): no port or use_ssl option is passed, so this looks like a
  # plain-HTTP connection; confirm whether requests carrying the API key
  # should go over HTTPS instead.
  HTTP = Net::HTTP.start('api.tumblr.com')

  # Performs a GET request and returns the response object.
  #
  # @param path [String] either a path on the API host (e.g. "/v2/...") or an
  #   absolute http(s) URL
  # @param params [Hash] query parameters; keys and values are URL-encoded
  # @return [Net::HTTPResponse] response; callers read `.body`
  def self.get(path, params = {})
    # encode_www_form escapes reserved characters in keys and values, which
    # plain "key=value" interpolation does not.
    query = URI.encode_www_form(params)
    # Only append "?" when there is a query, so URLs passed without params
    # are requested unchanged.
    path_with_params = query.empty? ? path : "#{path}?#{query}"
    p path_with_params
    if path_with_params.match?(%r{\Ahttps?://}i)
      # Absolute URLs (such as photo URLs on a media host) are fetched from
      # their own host rather than through the api.tumblr.com connection.
      Net::HTTP.get_response(URI(path_with_params))
    else
      HTTP.get(path_with_params)
    end
  end
end
# Make sure the `posts` table exists. Each row holds one post: its id, the
# blog it belongs to, and a text payload.
posts_columns = {
  post_id: 'INTEGER NOT NULL UNIQUE',
  blog_name: 'TEXT NOT NULL',
  payload: 'TEXT NOT NULL',
}
Sequel.create_table('posts', posts_columns)
# Fetches one page of photo posts for a blog from the Tumblr API and stores
# each post in the local `posts` table.
#
# @param blog_name [String] blog identifier; ".tumblr.com" is appended to it
# @param offset [Integer] value sent as the `offset` query parameter
# @param limit [Integer] value sent as the `limit` query parameter
# @return [Array] the posts parsed from the API response
# @raise any error other than a SQLite constraint violation raised while
#   fetching, parsing or storing the posts
def http_get_posts_photos(blog_name, offset = 0, limit = 50)
  r = Web.get(
    "/v2/blog/#{blog_name}.tumblr.com/posts/photo",
    offset: offset,
    limit: limit,
    reblog_info: true,
    api_key: API_KEY,
  )
  # Method-style access on the parsed hashes (`.response`, `.id`, ...) is not
  # plain Hash behavior; presumably it is provided by the `omakase` require.
  posts = JSON.parse(r.body, symbolize_names: true).response.posts
  posts.each do |post|
    begin
      Sequel.execute(
        'INSERT INTO posts (post_id, blog_name, payload) VALUES (?, ?, ?)',
        post.id,
        post.blog_name,
        post.to_json,
      )
    rescue SQLite3::ConstraintException
      # The row violates a table constraint; the expected case is a post that
      # was already stored by an earlier run (duplicate post_id). Skip it.
      # Every other error is deliberately left to propagate instead of being
      # swallowed by a blanket `rescue nil`.
      next
    end
  end
end
# --- Main script: fetch post metadata, then download every photo ----------
puts "Downloading metadata for #{$num_photos} photos from blog #{$blog_name}"
# Page through the API 50 posts at a time. Stepping over the range (instead
# of running `$num_photos / 50` whole pages) also requests the final partial
# page, so asking for e.g. 20 or 120 photos is no longer silently truncated
# to 0 or 100.
# NOTE(review): the Tumblr API documentation may cap `limit` below 50;
# confirm that a page size of 50 is honoured by the server.
(0...$num_photos).step(50) do |offset|
  http_get_posts_photos($blog_name, offset, [50, $num_photos - offset].min)
end

data = Sequel.execute('SELECT * FROM posts WHERE blog_name = ?', $blog_name)
puts "Processing #{data.length} posts..."
# Only the first three columns of each row are used; any remaining columns
# are dropped by the block-parameter destructuring.
data.each do |post_id, blog_name, payload|
  begin
    post = JSON.parse(payload, symbolize_names: true)
    next unless post[:caption] && post[:photos]

    # Print the post caption as plain text (HTML stripped, blank runs folded).
    post_caption = Nokogiri::HTML(post.caption).text.strip.squeeze("\n")
    puts post_caption unless post_caption.empty?

    post.photos.each do |photo|
      photo_caption = Nokogiri::HTML(photo.caption).text.strip.squeeze("\n")
      puts photo_caption unless photo_caption.empty?

      url = photo.original_size.url
      photo_suffix = url[/tumblr_(.*_.*\..*)$/, 1]
      fname = "#{blog_name}/tumblr_#{blog_name}_#{post_id}_#{photo_suffix}"
      # Files already on disk are not downloaded again.
      next if File.exist?(fname)

      p fname
      # Create the per-blog directory only when missing, so a genuine failure
      # (e.g. permissions) is reported instead of being swallowed.
      Dir.mkdir(blog_name) unless Dir.exist?(blog_name)
      # Download first, write second: if the request fails, no empty file is
      # left behind to be mistaken for a finished download on the next run.
      image = Web.get(url).body
      # Image data is binary; binwrite avoids newline/encoding translation.
      File.binwrite(fname, image)
    end
  rescue => ex
    # `ex` is kept so it can be inspected from the pry session.
    binding.pry if $debug
    raise
  end
end
:done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment