Instantly share code, notes, and snippets.

Embed
What would you like to do?
Crawl all data using the openBD API
# Creates the open_bds table: one row per ISBN, with a jsonb +content+
# column, a +last_crawled_at+ timestamp and the usual Rails timestamps.
class CreateOpenBds < ActiveRecord::Migration
  def change
    table = :open_bds

    create_table table do |t|
      t.string :isbn, null: false
      t.jsonb :content
      t.datetime :last_crawled_at
      t.timestamps null: false
    end

    # Each ISBN may appear only once.
    add_index table, :isbn, unique: true
    # GIN index over the jsonb column.
    add_index table, :content, using: :gin
    # Plain indexes on the three timestamp columns, in the original order.
    %i[last_crawled_at created_at updated_at].each do |column|
      add_index table, column
    end
  end
end
# ActiveRecord model backed by the open_bds table. The CrawlerFeature concern
# copies data from the openBD API (https://api.openbd.jp) into that table.
#
# Usage:
#   OpenBd.crawl!
class OpenBd < ActiveRecord::Base
  concerning :CrawlerFeature do
    included do
      # Rows that have never been crawled.
      scope :not_crawled, -> { where(last_crawled_at: nil) }

      # Rows that were never crawled, or whose last crawl is more than
      # 7 days before +now+ (defaults to Time.zone.now).
      scope :expired, ->(now = nil) {
        now ||= Time.zone.now
        where('last_crawled_at is null OR last_crawled_at < ?', now - 7.days)
      }
    end

    class_methods do
      # Creates a row for every ISBN reported by the coverage endpoint that is
      # not stored yet, then fetches content for every expired row.
      #
      # @param now [Time, nil] reference time used to decide which rows are
      #   expired; defaults to Time.zone.now
      def crawl!(now = nil)
        now ||= Time.zone.now
        generate_records
        crawl_in_batches(now)
      end

      private

      # Inserts an empty row (isbn only) for each ISBN returned by #coverage
      # that has no row yet. Works in slices of 10,000 ISBNs so the lookup
      # query stays bounded.
      def generate_records
        coverage.each_slice(10_000) do |isbns|
          saved_isbns = where(isbn: isbns).pluck(:isbn)
          (isbns - saved_isbns).each { |new_isbn| create(isbn: new_isbn) }
        end
      end

      # Fetches content for all expired rows, 5000 rows per API request, and
      # stores each returned entry on the matching row. ISBNs of a batch for
      # which nothing was stored are appended to log/batch.log.
      #
      # @param now [Time, nil] reference time passed to the +expired+ scope
      def crawl_in_batches(now = nil)
        now ||= Time.zone.now
        expired(now).find_in_batches(batch_size: 5000) do |batch|
          # isbn => record; entries are removed as they are stored, so what is
          # left at the end of the batch is exactly the "not loaded" set.
          pending = batch.index_by(&:isbn)

          get(pending.keys).each do |data|
            if data.present? && data['summary'].present?
              isbn = data['summary']['isbn']
              rec = pending.delete(isbn)
              if rec
                # The stored timestamp is the actual write time, not +now+.
                rec.update(last_crawled_at: Time.zone.now, content: data)
              else
                # The response carried an ISBN that is not part of this batch
                # (or a duplicate); skip it instead of failing on a nil record.
                puts "Unexpected ISBN in response: #{isbn}"
              end
            else
              puts "Empty data!"
            end
          end

          next if pending.empty?

          File.open('log/batch.log', 'a+') do |f|
            f.puts "NOT LOADED ISBNS=#{pending.keys}"
          end
        end
      end

      # GETs /v1/coverage and returns the parsed JSON body.
      # NOTE(review): callers treat the result as an array of ISBN strings —
      # confirm against the API documentation.
      def coverage
        response = conn.get('/v1/coverage')
        JSON.parse(response.body)
      end

      # POSTs the given ISBNs (comma-joined in the +isbn+ form field) to
      # /v1/get and returns the parsed JSON body.
      #
      # @param isbns [Array<String>]
      def get(isbns)
        response = conn.post('/v1/get', isbn: isbns.join(','))
        JSON.parse(response.body)
      end

      # Builds a new Faraday connection to the openBD API on every call.
      def conn
        Faraday.new(url: 'https://api.openbd.jp')
      end
    end
  end
end
@chsh

This comment has been minimized.

Show comment
Hide comment
@chsh

chsh Jan 25, 2017

usage

OpenBd.crawl!
Owner

chsh commented Jan 25, 2017

usage

OpenBd.crawl!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment