-
-
Save delineas/f496563ebdd7071d7c11f8eb3b41b5b5 to your computer and use it in GitHub Desktop.
Busca la informacion de un libro segun su isbn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby1.9.1 | |
# Script para traducir isbn a informacion del libro buscando en varias | |
# librerias online. Se probo por ultima vez el 2009-12-11. | |
# | |
# Author:: Luis Parravicini | |
# Copyright:: Copyright (C) 2009 Luis Parravicini | |
# License:: GPL v2 | |
# | |
# http://ktulu.com.ar/blog/2009/12/11/traduciendo-isbn/ | |
# | |
require 'mechanize' | |
require 'activesupport' | |
require 'set' | |
require 'fileutils' | |
# Base class for a data source that looks up book information by ISBN.
#
# Subclasses must implement +custom_search(isbn)+, returning a Hash of book
# attributes (String or nil values) or nil when nothing was found.
class Source
  # agent:: object kept in @agent for subclasses to fetch pages with.
  def initialize(agent)
    @agent = agent
  end

  # Identifier of this source: the downcased class name.
  def name
    self.class.name.downcase
  end

  # Looks up +isbn+ through the subclass's +custom_search+.
  #
  # Returns nil when +custom_search+ returns nil or a Hash without any
  # non-nil value. Otherwise returns that Hash with every non-nil value
  # stripped and its whitespace runs collapsed to a single space, and with
  # +isbn+ stored under :isbn.
  def search(isbn)
    data = custom_search(isbn)
    return if data.nil? || empty?(data)

    data.each do |key, value|
      next if value.nil?

      # Store the cleaned copy back; a bare gsub (no bang) would only build
      # a new string and throw it away, leaving the value untouched.
      data[key] = value.strip.gsub(/\s+/, ' ')
    end
    data[:isbn] = isbn
    data
  end

  # True when +book+ holds no non-nil value other than :isbn.
  def empty?(book)
    book.none? { |key, value| key != :isbn && !value.nil? }
  end
end
# Source backed by the quick-search results page of casadellibro.com.
class CasaDelLibro < Source
  # Fetches the search page for +isbn+ and extracts what it can find.
  #
  # Returns a Hash that may contain :cover, :title, :authors, :house and
  # :edition (values may be nil), or nil if the result lookup yields nil.
  # The publisher is stored under :house, the key the other sources and
  # the CSV export use (it used to be stored under :editorial, which the
  # export never read).
  def custom_search(isbn)
    page = @agent.get("http://www.casadellibro.com/busquedas/quickResults?tbusq=i&buscar=#{isbn}")
    result = page.search("div.fBusqueda")
    return if result.nil?

    data = {}
    img = result.at('div.imagen img.fbus')
    data[:cover] = img.nil? ? nil : img['src']

    book = result.at('div.datosLibro')
    return data if book.nil?

    data[:title] = book.at('.tit').try(:inner_text)
    data[:authors] = book.at('.pAutores').try(:inner_text)
    # Drop the leading "de " that precedes the author names.
    data[:authors].gsub!(/^\s*de\s+/, '') unless data[:authors].nil?

    house = book.at('.pEditorial')
    unless house.nil?
      data[:house] = house.inner_text
      # The edition is taken from the first following sibling named 'p'.
      # TODO: shouldn't next_sibling skip "\n" text nodes?
      sibling = house.next_sibling
      sibling = sibling.next_sibling until sibling.nil? || sibling.name == 'p'
      data[:edition] = sibling.try(:inner_text)
    end
    data
  end
end
# Source backed by the mobile book page of lsf.com.ar.
class Lsf < Source
  # Fetches the page for +isbn+ and extracts cover, title, authors and
  # publisher.
  #
  # Returns nil when the cover image's src is '#'. Otherwise returns a Hash
  # that may contain :cover (omitted for the "nodisponible.gif"
  # placeholder), :title, :authors and :house; values may be nil.
  def custom_search(isbn)
    page = @agent.get("http://www.lsf.com.ar/m/ficha.aspx?codigo=#{isbn}")

    cover_tag = page.at('img#ctl00_ContentPlaceHolder1_img_tapa')
    cover = cover_tag.nil? ? nil : cover_tag['src']
    return if cover == '#'

    found = {}
    found[:cover] = cover unless cover =~ /nodisponible\.gif$/

    info = page.search('div.vistaMaxInfo')
    return found if info.nil?

    found[:title] = info.at('div.vistaMaxInfoTit').try(:inner_text)
    found[:authors] = info.at('div.autores').try(:inner_text)

    publisher = info.at('#ctl00_ContentPlaceHolder1_div_editorial').try(:inner_text)
    # Remove the "Editorial:" label in front of the publisher name.
    publisher.gsub!(/^\s*Editorial\s*:\s*/i, '') unless publisher.nil?
    found[:house] = publisher
    found
  end
end
# Source backed by the ISBN search page of alibris.com.
class ALibris < Source
  # Fetches the search page for +isbn+ and extracts cover, publisher,
  # authors, title and edition.
  #
  # Always returns a Hash; it may contain :cover (omitted for the
  # "no_image.gif" placeholder), :house, :authors, :title and :edition,
  # and values may be nil.
  def custom_search(isbn)
    page = @agent.get("http://www.alibris.com/search/books/isbn/#{isbn}")

    cover_tag = page.at('img.cvr')
    cover = cover_tag.nil? ? nil : cover_tag['src']

    found = {}
    found[:cover] = cover unless cover =~ /no_image\.gif$/

    box = page.at('div#box-1col-merch-isbn')
    return found if box.nil?

    found[:house] = box.at('div.publisher-details').try(:inner_text)

    heading = box.at('div.author-title')
    return found if heading.nil?

    found[:authors] = heading.at('a').try(:inner_text)
    found[:title] = heading.at('h2').try(:inner_text)

    edition = heading.at('p').try(:inner_text)
    # An edition wrapped in parentheses is stored without them.
    edition = edition[1..-2] if edition =~ /^\(.+\)$/
    found[:edition] = edition
    found
  end
end
# Source backed by the book search results of amazon.com.
class Amazon < Source
  # Fetches the search page for +isbn+ and reads the first result.
  #
  # Returns nil when the page has no 'div#result_0' element. Otherwise
  # returns a Hash that may contain :cover (omitted when the src matches
  # "no-img-sm"), :authors, :title and :edition; values may be nil.
  def custom_search(isbn)
    page = @agent.get("http://www.amazon.com/s/ref=nb_ss?url=search-alias%3Dstripbooks&field-keywords=#{isbn}&x=0&y=0")

    first_hit = page.at('div#result_0')
    return if first_hit.nil?

    cover_tag = first_hit.at('div.productImage img')
    cover = cover_tag.nil? ? nil : cover_tag['src']

    found = {}
    found[:cover] = cover unless cover =~ /no-img-sm/

    heading = first_hit.at('div.productTitle')
    return found if heading.nil?

    byline = heading.at('span.ptBrand').try(:inner_text)
    unless byline.nil?
      # Drop the leading "by " in front of the author names.
      byline.gsub!(/^\s*by\s+/, '')
      found[:authors] = byline
    end
    found[:title] = heading.at('a').try(:inner_text)
    found[:edition] = heading.at('span.format').try(:inner_text)
    found
  end
end
# Loads the Marshal-serialized object stored in the file +fname+.
#
# Returns an empty Hash when the file does not exist.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects, so this
# must only be pointed at files written by this script.
def unmarshal(fname)
  # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
  return {} unless File.exist?(fname)

  # Marshal data is binary: read it in binary mode so no newline or
  # encoding translation is applied.
  File.open(fname, 'rb') { |f| Marshal.load(f) }
end
# Serializes +data+ with Marshal into the file +fname+.
#
# The data is first written to "<fname>.tmp" and then moved over +fname+,
# so a failed dump never truncates the existing file.
def marshal(fname, data)
  tmp = "#{fname}.tmp"
  begin
    # Marshal output is binary, so write it in binary mode.
    File.open(tmp, 'wb') { |f| Marshal.dump(data, f) }
    # FileUtils.mv replaces an existing destination file, so the old file
    # is not removed beforehand (that left a window with no file at all).
    FileUtils.mv(tmp, fname)
  ensure
    # Only has an effect when the dump or the move failed.
    FileUtils.rm_f(tmp)
  end
end
# Prints one CSV row to standard output for each ISBN read from +stream+
# (one ISBN per line, surrounding whitespace ignored).
#
# A row holds the ISBN followed, when the ISBN is present in $books, by
# title, authors, publisher and edition. Every field is wrapped in double
# quotes and embedded double quotes are doubled, so book data cannot
# break the CSV.
def dump_csv(stream)
  stream.each_line do |line|
    isbn = line.strip
    book = $books[isbn]
    fields = [isbn]
    unless book.nil?
      # Fall back to :editorial, the key under which some cached entries
      # stored the publisher.
      publisher = book[:house] || book[:editorial]
      fields += [book[:title], book[:authors], publisher, book[:edition]]
    end
    puts fields.map { |field| %("#{field.to_s.gsub('"', '""')}") }.join(',')
  end
end
# Looks up book information for +isbn+, trying the sources in $strategies
# in random order until one of them returns a book.
#
# When a source does not know an ISBN, that miss is recorded in the
# negative cache ($negative) and the source is not asked again for that
# ISBN until NEGATIVE_TTL has elapsed. A source that raises is reported on
# standard error and skipped without touching the negative cache, so one
# failing site does not abort the lookup or hide the ISBN for the whole
# TTL.
#
# Prints a '.' per source considered and a final newline. On success the
# book is stored in $books[isbn], the source name is added to
# book[:sources], and the book is returned; otherwise nil is returned.
def fetch_book(isbn)
  now = Time.now
  book = nil
  $strategies.shuffle.each do |source|
    print '.'
    missed_at = ($negative[isbn] || {})[source.name]
    # Skip sources whose negative cache entry has not expired yet.
    next unless missed_at.nil? || missed_at + NEGATIVE_TTL < now

    begin
      book = source.search(isbn)
    rescue StandardError => e
      warn "#{source.name}: #{e.class}: #{e.message}"
      next
    end

    if book.nil?
      ($negative[isbn] ||= {})[source.name] = Time.now
    else
      book[:sources] ||= Set.new
      book[:sources].add(source.name)
      $books[isbn] = book
      break
    end
  end
  puts
  book
end
# Lifetime of a negative cache entry.
NEGATIVE_TTL = 20.day
# File holding the marshalled negative cache.
NEGATIVE = 'negative.cache'
# File holding the marshalled books found so far.
BOOKS = 'books'

$negative = unmarshal(NEGATIVE)
$books = unmarshal(BOOKS)

agent = WWW::Mechanize.new
agent.user_agent_alias = 'Windows IE 7'

$strategies = [Lsf, CasaDelLibro, ALibris, Amazon].map { |source| source.new(agent) }

# With --csv as first argument: export the ISBNs read from stdin and quit.
if ARGV.shift == '--csv'
  dump_csv($stdin)
  exit
end

# Interactive mode: read ISBNs from stdin until end of input.
loop do
  print '> '
  isbn = $stdin.gets
  break if isbn.nil?

  isbn.strip!
  next if isbn.empty?

  # TODO: if size < 10 it should be padded with 0
  unless [10, 13].include?(isbn.size)
    puts "no tiene longitud de isbn"
    next
  end

  isbn.upcase! # some ISBNs end in 'x'
  book = $books.has_key?(isbn) ? $books[isbn] : fetch_book(isbn)
  puts book

  # Persist both caches after every lookup.
  marshal(BOOKS, $books)
  marshal(NEGATIVE, $negative)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment