Skip to content

Instantly share code, notes, and snippets.

@delineas
Forked from luisparravicini/isbn_search.rb
Created October 25, 2021 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save delineas/f496563ebdd7071d7c11f8eb3b41b5b5 to your computer and use it in GitHub Desktop.
Save delineas/f496563ebdd7071d7c11f8eb3b41b5b5 to your computer and use it in GitHub Desktop.
Busca la información de un libro según su ISBN
#!/usr/bin/ruby1.9.1
# Script para traducir isbn a informacion del libro buscando en varias
# librerias online. Se probo por ultima vez el 2009-12-11.
#
# Author:: Luis Parravicini
# Copyright:: Copyright (C) 2009 Luis Parravicini
# License:: GPL v2
#
# http://ktulu.com.ar/blog/2009/12/11/traduciendo-isbn/
#
require 'mechanize'
require 'activesupport'
require 'set'
require 'fileutils'
# Data source that obtains the information of a book from its isbn.
#
# Subclasses must implement +custom_search(isbn)+, returning a Hash with the
# book attributes or nil when nothing was found.
class Source
  # agent:: object the subclasses use to fetch pages; stored in @agent.
  def initialize(agent)
    @agent = agent
  end

  # Name of the source: the class name in lowercase.
  def name
    self.class.name.downcase
  end

  # Looks up +isbn+ through +custom_search+ and normalizes the result.
  #
  # Returns the Hash produced by +custom_search+ with every String value
  # stripped and with runs of whitespace collapsed to a single space, plus
  # the :isbn key set to +isbn+. Returns nil when +custom_search+ returns
  # nil or when all the values (ignoring :isbn) are nil.
  def search(isbn)
    data = custom_search(isbn)
    return if data.nil? || empty?(data)

    # Iterate over a snapshot of the keys: the values are reassigned below.
    data.keys.each do |key|
      value = data[key]
      next unless value.is_a?(String)

      # Assign the result back: the non-bang gsub returns a new string, so
      # calling it without using the result would not collapse anything.
      data[key] = value.strip.gsub(/\s+/, ' ')
    end
    data[:isbn] = isbn
    data
  end

  # True when +book+ holds no non-nil value besides the :isbn key.
  def empty?(book)
    book.all? { |key, value| key == :isbn || value.nil? }
  end
end
# Source that scrapes the quick-search results page of casadellibro.com.
class CasaDelLibro < Source
  # Fetches the results page for +isbn+ and returns a Hash that always has
  # :cover and, depending on which nodes are present in the page, :title,
  # :authors, :editorial and :edition. Values may be nil.
  def custom_search(isbn)
    page = @agent.get("http://www.casadellibro.com/busquedas/quickResults?tbusq=i&buscar=#{isbn}")
    results = page.search("div.fBusqueda")
    return if results.nil?

    cover = results.at('div.imagen img.fbus')
    cover = cover['src'] unless cover.nil?
    info = { :cover => cover }

    details = results.at('div.datosLibro')
    return info if details.nil?

    info[:title] = details.at('.tit').try(:inner_text)
    authors = details.at('.pAutores').try(:inner_text)
    # Drop the leading "de " that precedes the author names.
    authors.gsub!(/^\s*de\s+/, '') unless authors.nil?
    info[:authors] = authors

    publisher = details.at('.pEditorial')
    return info if publisher.nil?

    info[:editorial] = publisher.inner_text
    # The edition is taken from the first <p> that follows the publisher node.
    # TODO: shouldn't next_sibling skip the \n text nodes?
    sibling = publisher.next_sibling
    sibling = sibling.next_sibling until sibling.nil? || sibling.name == 'p'
    info[:edition] = sibling.try(:inner_text)
    info
  end
end
# Source that scrapes the mobile book page of lsf.com.ar.
class Lsf < Source
  # Fetches the page for +isbn+ and returns a Hash with :cover (unless the
  # image is the "nodisponible.gif" placeholder), :title, :authors and :house.
  # Returns nil when the cover image src is '#'
  # (NOTE(review): presumably how the site marks an unknown isbn - confirm).
  def custom_search(isbn)
    page = @agent.get("http://www.lsf.com.ar/m/ficha.aspx?codigo=#{isbn}")

    cover = page.at('img#ctl00_ContentPlaceHolder1_img_tapa')
    cover = cover['src'] unless cover.nil?
    return if cover == '#'

    book = {}
    book[:cover] = cover unless cover =~ /nodisponible\.gif$/

    info = page.search('div.vistaMaxInfo')
    return book if info.nil?

    book[:title] = info.at('div.vistaMaxInfoTit').try(:inner_text)
    book[:authors] = info.at('div.autores').try(:inner_text)
    publisher = info.at('#ctl00_ContentPlaceHolder1_div_editorial').try(:inner_text)
    # Remove the "Editorial:" label that prefixes the publisher name.
    publisher.gsub!(/^\s*Editorial\s*:\s*/i, '') unless publisher.nil?
    book[:house] = publisher
    book
  end
end
# Source that scrapes the isbn search page of alibris.com.
class ALibris < Source
  # Fetches the page for +isbn+ and returns a Hash with :cover (unless the
  # image is the "no_image.gif" placeholder) and, when the matching nodes are
  # present, :house, :authors, :title and :edition. Values may be nil.
  def custom_search(isbn)
    page = @agent.get("http://www.alibris.com/search/books/isbn/#{isbn}")
    book = {}

    cover = page.at('img.cvr')
    cover = cover['src'] unless cover.nil?
    book[:cover] = cover unless cover =~ /no_image\.gif$/

    box = page.at('div#box-1col-merch-isbn')
    return book if box.nil?

    book[:house] = box.at('div.publisher-details').try(:inner_text)
    heading = box.at('div.author-title')
    return book if heading.nil?

    book[:authors] = heading.at('a').try(:inner_text)
    book[:title] = heading.at('h2').try(:inner_text)
    edition = heading.at('p').try(:inner_text)
    # When the regexp matches, the first and last characters are dropped
    # (the surrounding parentheses of a "(...)" edition text).
    book[:edition] = (edition =~ /^\(.+\)$/) ? edition[1..-2] : edition
    book
  end
end
# Source that scrapes the first result of an amazon.com book search.
class Amazon < Source
  # Searches for +isbn+ and returns a Hash built from the first result
  # (div#result_0): :cover (unless the image src contains "no-img-sm"),
  # :authors (only when present), :title and :edition.
  # Returns nil when the page has no first result.
  def custom_search(isbn)
    page = @agent.get("http://www.amazon.com/s/ref=nb_ss?url=search-alias%3Dstripbooks&field-keywords=#{isbn}&x=0&y=0")
    first_hit = page.at('div#result_0')
    return if first_hit.nil?

    book = {}
    cover = first_hit.at('div.productImage img')
    cover = cover['src'] unless cover.nil?
    book[:cover] = cover unless cover =~ /no-img-sm/

    heading = first_hit.at('div.productTitle')
    return book if heading.nil?

    byline = heading.at('span.ptBrand').try(:inner_text)
    if byline
      # Drop the leading "by " that precedes the author names.
      byline.gsub!(/^\s*by\s+/, '')
      book[:authors] = byline
    end
    book[:title] = heading.at('a').try(:inner_text)
    book[:edition] = heading.at('span.format').try(:inner_text)
    book
  end
end
# Loads the object stored with Marshal in the file +fname+.
#
# Returns the deserialized object, or an empty Hash when +fname+ does not
# exist.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects; only use this
# on cache files written by this script, never on untrusted files.
def unmarshal(fname)
  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
  return {} unless File.exist?(fname)

  # Marshal data is binary, so read it without newline/encoding conversion.
  File.open(fname, 'rb') { |f| Marshal.load(f) }
end
# Serializes +data+ with Marshal into the file +fname+.
#
# The data is first written to "<fname>.tmp" and then moved over +fname+, so
# an existing +fname+ is only replaced after the new content was written
# completely.
def marshal(fname, data)
  tmp = "#{fname}.tmp"
  # Marshal data is binary, so write it without newline/encoding conversion.
  File.open(tmp, 'wb') { |f| Marshal.dump(data, f) }
  # Moving onto an existing file replaces it; removing +fname+ first would
  # leave a window with no cache file at all.
  FileUtils.mv(tmp, fname)
end
# Prints to stdout one csv line with the information of a book for each isbn
# read from +stream+ (one isbn per line).
#
# Each line has the isbn and, when the isbn is in $books, the title, authors,
# publisher and edition. Every field is wrapped in double quotes and double
# quotes inside a field are doubled, so book data cannot break the csv.
def dump_csv(stream)
  quote = lambda { |field| '"' + field.to_s.gsub('"', '""') + '"' }

  while (line = stream.gets)
    isbn = line.strip
    book = $books[isbn]
    fields = [isbn]
    unless book.nil?
      # Some sources store the publisher under :editorial instead of :house.
      publisher = book[:house] || book[:editorial]
      fields += [book[:title], book[:authors], publisher, book[:edition]]
    end
    puts fields.map(&quote).join(',')
  end
end
# Obtains the information of a book from its isbn, asking the sources in
# $strategies in random order until one of them knows the isbn.
#
# When a source does not know an isbn, that is recorded in the negative cache
# ($negative) and the source is not asked again for that isbn until
# NEGATIVE_TTL has elapsed. A found book is stored in $books, tagged with the
# name of the source under :sources.
#
# Prints one '.' per source visited and a final newline. Returns the book
# Hash, or nil when no source returned one.
def fetch_book(isbn)
  started_at = Time.now
  found = nil

  $strategies.sort_by { rand }.each do |source|
    print '.'
    misses = $negative[isbn]
    last_miss = misses.nil? ? nil : misses[source.name]
    # Skip the source while its negative cache entry is still fresh.
    next unless last_miss.nil? || last_miss + NEGATIVE_TTL < started_at

    found = source.search(isbn)
    if found.nil?
      ($negative[isbn] ||= {})[source.name] = Time.now
    else
      (found[:sources] ||= Set.new).add(source.name)
      $books[isbn] = found
      break
    end
  end

  puts
  found
end
# Time a negative cache entry stays valid (see fetch_book).
NEGATIVE_TTL = 20.day
# Names of the files holding the marshalled negative cache and book cache.
NEGATIVE = 'negative.cache'
BOOKS = 'books'

# Caches loaded from disk; both are empty hashes when the files do not exist.
$negative = unmarshal(NEGATIVE)
$books = unmarshal(BOOKS)

# A single agent is shared by every source.
agent = WWW::Mechanize.new
agent.user_agent_alias = 'Windows IE 7'
$strategies = [Lsf, CasaDelLibro, ALibris, Amazon].map { |source| source.new(agent) }
# Batch mode: with --csv as first argument, dump a csv for the isbns read
# from stdin and finish.
if ARGV.shift == '--csv'
  dump_csv($stdin)
  exit
end

# Interactive mode: read isbns from stdin until end of input, printing the
# book found for each one and saving both caches after every lookup.
loop do
  print '> '
  isbn = $stdin.gets
  break if isbn.nil?

  isbn.strip!
  next if isbn.empty?

  # TODO: if size < 10 it should be padded with 0
  unless [10, 13].include?(isbn.size)
    puts "no tiene longitud de isbn"
    next
  end

  isbn.upcase! # some isbns end in 'x'
  book = $books.has_key?(isbn) ? $books[isbn] : fetch_book(isbn)
  puts book
  marshal(BOOKS, $books)
  marshal(NEGATIVE, $negative)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment