Skip to content

Instantly share code, notes, and snippets.

@chsh
Last active January 12, 2021 22:44
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chsh/19051bd89d9b4eaf4066a9f3a40c5ae8 to your computer and use it in GitHub Desktop.
Save chsh/19051bd89d9b4eaf4066a9f3a40c5ae8 to your computer and use it in GitHub Desktop.
Crawl all data using openBD API
# Creates the open_bds table: a required ISBN string, a JSONB content
# column, a last-crawled timestamp and the standard Rails timestamps,
# plus the indexes used to look rows up by each of those columns.
class CreateOpenBds < ActiveRecord::Migration
  def change
    create_table(:open_bds) do |table|
      table.string(:isbn, null: false)
      table.jsonb(:content)
      table.datetime(:last_crawled_at)
      table.timestamps(null: false)
    end

    # At most one row per ISBN.
    add_index(:open_bds, :isbn, unique: true)
    # GIN index over the JSONB column.
    add_index(:open_bds, :content, using: :gin)
    # Plain indexes on the three timestamp columns.
    [:last_crawled_at, :created_at, :updated_at].each do |column|
      add_index(:open_bds, column)
    end
  end
end
# One row per ISBN, holding the JSON document fetched for that ISBN from
# the openBD HTTP API together with the time it was last fetched.
#
# Usage: OpenBd.crawl!
class OpenBd < ActiveRecord::Base
  # A record is considered stale once its last crawl is older than this.
  CRAWL_EXPIRES_IN = 7.days
  # How many ISBNs from the coverage list are checked against the table at once.
  COVERAGE_SLICE_SIZE = 10_000
  # How many stale records are sent to the API in a single request.
  CRAWL_BATCH_SIZE = 5000

  concerning :CrawlerFeature do
    included do
      # Records that have never been crawled.
      scope :not_crawled, -> { where(last_crawled_at: nil) }

      # Records never crawled, or last crawled more than CRAWL_EXPIRES_IN
      # before +now+ (defaults to the current time).
      scope :expired, ->(now = nil) {
        now ||= Time.zone.now
        where('last_crawled_at is null OR last_crawled_at < ?', now - CRAWL_EXPIRES_IN)
      }
    end

    class_methods do
      # Creates a row for every ISBN in the coverage list that is not stored
      # yet, then (re)fetches the content of every expired row.
      #
      # @param now [Time, nil] reference time used to decide which rows are
      #   expired; defaults to the current time.
      def crawl!(now = nil)
        now ||= Time.zone.now
        generate_records
        crawl_in_batches(now)
      end

      private

      # Inserts an empty row for each ISBN returned by #coverage that has no
      # row yet. Works slice by slice so the existence check stays bounded.
      def generate_records
        coverage.each_slice(COVERAGE_SLICE_SIZE) do |isbns|
          saved_isbns = where(isbn: isbns).pluck(:isbn)
          # uniq: a repeated ISBN inside one slice would otherwise be inserted
          # twice and violate the unique index on isbn.
          (isbns - saved_isbns).uniq.each do |new_isbn|
            create(isbn: new_isbn)
          end
        end
      end

      # Fetches content for every expired record, one API request per batch,
      # and logs the ISBNs of each batch that could not be loaded.
      def crawl_in_batches(now = nil)
        now ||= Time.zone.now
        expired(now).find_in_batches(batch_size: CRAWL_BATCH_SIZE) do |batch|
          not_loaded = store_batch(batch)
          log_not_loaded(not_loaded) if not_loaded.present?
        end
      end

      # Requests the given records' ISBNs from the API and stores each usable
      # response entry on its record.
      #
      # @param batch [Array<OpenBd>] records to refresh
      # @return [Array<String>] ISBNs of the batch for which nothing was stored
      def store_batch(batch)
        isbn2rec = batch.index_by(&:isbn)
        loaded = []

        get(isbn2rec.keys).each do |data|
          unless data.present? && data['summary'].present?
            puts "Empty data!"
            next
          end

          isbn = data['summary']['isbn']
          rec = isbn2rec[isbn]
          # The response is keyed by the ISBN it reports, which is not
          # guaranteed to be one we asked for; skip it instead of calling
          # #update on nil and aborting the whole crawl.
          if rec.nil?
            puts "Unexpected ISBN in response: #{isbn}"
            next
          end

          rec.update last_crawled_at: Time.zone.now, content: data
          loaded << isbn
        end

        # Single subtraction instead of shrinking the list once per entry.
        isbn2rec.keys - loaded
      end

      # Appends the ISBNs that could not be loaded to log/batch.log
      # (path is relative to the process working directory).
      def log_not_loaded(isbns)
        File.open('log/batch.log', 'a+') do |f|
          f.puts "NOT LOADED ISBNS=#{isbns}"
        end
      end

      # GETs /v1/coverage and returns the parsed JSON body; the caller treats
      # it as a list of ISBNs.
      def coverage
        response = conn.get '/v1/coverage'
        JSON.parse(response.body)
      end

      # POSTs the comma-joined ISBNs to /v1/get and returns the parsed JSON
      # body; the caller treats it as a list with one entry per document,
      # where an entry may be blank.
      def get(isbns)
        response = conn.post '/v1/get', isbn: isbns.join(',')
        JSON.parse(response.body)
      end

      # Builds a Faraday connection to the API host.
      def conn
        Faraday.new(url: 'https://api.openbd.jp')
      end
    end
  end
end
@chsh
Copy link
Author

chsh commented Jan 25, 2017

usage

OpenBd.crawl!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment