Last active
December 4, 2018 08:00
-
-
Save akuhn/1fb5c3a70cc876f40594932f3d28f8a9 to your computer and use it in GitHub Desktop.
Download photos from a Tumblr blog.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'json'
require 'net/http'
require 'uri'
require 'nokogiri'
require 'omakase' # see omakase repo on my github
require 'pry'
require 'sqlite3'
# Tumblr API key, read from the "key" entry of a local JSON file.
API_KEY = JSON.parse(File.read('key_tumblr.json'))['key']

# Command line: BLOG_NAME NUM_PHOTOS [-d|--debug]
#
# The ARGV constant is used instead of $ARGV: $ARGV is only defined as an
# alias of $* once the `English` library has been loaded and is nil otherwise,
# which would make the length check below raise NoMethodError.
if ARGV.length < 2
  puts "Usage: ruby tumblr.rb BLOG_NAME NUM_PHOTOS"
  exit
end

$blog_name = ARGV[0]
$num_photos = ARGV[1].to_i
# Truthy only when the last argument is exactly "-d" or "--debug"; the regex
# is anchored so that an argument merely containing "-d" does not enable it.
$debug = ARGV.last =~ /\A(?:-d|--debug)\z/
# Thin helper around a single SQLite database file.
module Sequel
  # Database handle, opened when this module is loaded.
  SQL = SQLite3::Database.new('data_tumblr.sqlite')

  # Creates table `name` unless it already exists.
  #
  # A `created_at` column that defaults to the current timestamp is always
  # appended to the given columns. The caller's hash is not modified.
  #
  # @param name [String] table name
  # @param columns [Hash] column name => SQL type and constraints
  def self.create_table(name, columns)
    all_columns = columns.merge(created_at: 'DATETIME DEFAULT CURRENT_TIMESTAMP')
    body = all_columns.map { |column, type| "`#{column}` #{type}" }.join(', ')
    SQL.execute("CREATE TABLE IF NOT EXISTS `#{name}` (#{body})")
  end

  # Runs `query` with `values` bound to its placeholders and returns whatever
  # the underlying database handle returns.
  #
  # @param query [String] SQL text, using `?` placeholders
  # @param values [Array] values for the placeholders, in order
  def self.execute(query, *values)
    SQL.execute(query, values)
  end
end
# Minimal HTTP client bound to the Tumblr API host.
module Web
  # Connection to api.tumblr.com, opened when this module is loaded.
  HTTP = Net::HTTP.start('api.tumblr.com')

  # Issues a GET request over the shared connection.
  #
  # @param path [String] request path
  # @param params [Hash] query parameters; keys and values are URL-encoded
  # @return the response object returned by the connection's `get`
  def self.get(path, params = {})
    query = URI.encode_www_form(params)
    # Only append "?" when there is a query string to send.
    path_with_params = query.empty? ? path : "#{path}?#{query}"
    # Trace of every request; note that this prints the API key when it is
    # passed as a parameter.
    p path_with_params
    HTTP.get(path_with_params)
  end
end
# Table holding one row per downloaded post: the post id, the blog it
# belongs to, and the post serialized as text.
posts_columns = {
  post_id: 'INTEGER NOT NULL UNIQUE',
  blog_name: 'TEXT NOT NULL',
  payload: 'TEXT NOT NULL',
}
Sequel.create_table('posts', posts_columns)
# Fetches one page of photo posts for a blog from the Tumblr API and stores
# every post of the response in the `posts` table.
#
# @param blog_name [String] blog name without the ".tumblr.com" suffix
# @param offset [Integer] sent as the API `offset` parameter
# @param limit [Integer] sent as the API `limit` parameter
# @return the posts contained in the response
def http_get_posts_photos(blog_name, offset = 0, limit = 50)
  r = Web.get(
    "/v2/blog/#{blog_name}.tumblr.com/posts/photo",
    offset: offset,
    limit: limit,
    reblog_info: true,
    api_key: API_KEY,
  )
  # NOTE(review): method-style access on the parsed hashes (`.response`,
  # `.posts`, `post.id`) is presumably provided by the omakase library.
  posts = JSON.parse(r.body, symbolize_names: true).response.posts
  posts.each do |post|
    begin
      Sequel.execute(
        'INSERT INTO posts (post_id, blog_name, payload) VALUES (?, ?, ?)',
        post.id,
        post.blog_name,
        post.to_json,
      )
    rescue SQLite3::ConstraintException
      # The row violates a table constraint, typically because the post was
      # already stored by an earlier run. Skip it; any other error propagates
      # instead of being silently discarded.
      next
    end
  end
end
puts "Downloading metadata for #{$num_photos} photos from blog #{$blog_name}"

# Page through the API at most 50 posts at a time. Stepping by offset (rather
# than over `$num_photos / 50` whole pages) also requests the final partial
# page, so counts below 50 or not divisible by 50 are honoured.
0.step($num_photos - 1, 50) do |offset|
  http_get_posts_photos($blog_name, offset, [50, $num_photos - offset].min)
end

data = Sequel.execute('SELECT * FROM posts WHERE blog_name = ?', $blog_name)
puts "Processing #{data.length} posts..."

# Each row starts with post_id, blog_name and payload; further columns are
# ignored by the block parameters.
data.each do |post_id, blog_name, payload|
  begin
    post = JSON.parse(payload, symbolize_names: true)
    next unless post[:caption] && post[:photos]

    # Captions are HTML; print their text content with blank lines collapsed.
    caption_text = Nokogiri::HTML(post.caption).text.strip.squeeze("\n")
    puts caption_text unless caption_text.empty?

    post.photos.each do |photo|
      caption_text = Nokogiri::HTML(photo.caption).text.strip.squeeze("\n")
      puts caption_text unless caption_text.empty?

      url = photo.original_size.url
      photo_suffix = url[/tumblr_(.*_.*\..*)$/, 1]
      fname = "#{blog_name}/tumblr_#{blog_name}_#{post_id}_#{photo_suffix}"
      # Files already on disk are not downloaded again.
      next if File.exist?(fname)

      p fname
      Dir.mkdir(blog_name) unless Dir.exist?(blog_name)
      # Download before touching the file system so a failed request does not
      # leave an empty file behind that later runs would skip.
      # NOTE(review): `url` is passed to the connection opened for
      # api.tumblr.com; confirm this works for photo URLs on other hosts.
      image = Web.get(url).body
      # Binary mode: the body is image data, not text.
      File.binwrite(fname, image)
    end
  rescue => ex
    # `ex` is kept so it can be inspected from the pry session.
    binding.pry if $debug
    raise
  end
end
:done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment