Created
March 25, 2011 22:07
-
-
Save robmckinnon/887730 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Wikipedia
  # HTTP client that builds MediaWiki API query URLs and wraps the JSON
  # responses in Page objects.
  #
  # see http://en.wikipedia.org/w/api.php
  class Client
    # URL template; ":name" placeholders are filled in by #url_for.
    BASE_URL = "http://:domain/:path?action=:action&format=json"

    # Placeholders that form the URL's structure. Their values are inserted
    # verbatim, because percent-encoding the "/" in the path would break the URL.
    STRUCTURAL_KEYS = %w[domain path].freeze

    # Upper bound on redirect hops, so a redirect cycle cannot loop forever.
    MAX_REDIRECTS = 10

    # When true (the default set in #initialize), #find and #find_pageid
    # follow "#REDIRECT" pages to their target.
    attr_accessor :follow_redirects

    def initialize
      self.follow_redirects = true
    end

    # Fetches a page by title.
    #
    # @param title   a page title, or a page URL whose last path segment is the title
    # @param options extra query parameters merged over the defaults
    # @return [Page] the requested page, or the redirect target when
    #   follow_redirects is enabled
    def find(title, options = {})
      title = from_url(title) { |url| url.title }
      resolve_redirects(Page.new(request_page(title, options)), options)
    end

    # Fetches a page by page id.
    #
    # @param pageid  a page id, or a URL whose last path segment is the id
    # @param options extra query parameters merged over the defaults
    # @return [Page] the requested page, or the redirect target when
    #   follow_redirects is enabled
    def find_pageid(pageid, options = {})
      pageid = from_url(pageid) { |url| url.pageid }
      resolve_redirects(Page.new(request_pageid(pageid, options)), options)
    end

    # Fetches image information (imageinfo) for a "File:..." title.
    #
    # @return [Page]
    def find_image(title, options = {})
      title = from_url(title) { |url| url.title }
      Page.new(request_image(title, options))
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Cimages&rvprop=content&pageids=435509
    def request_pageid(pageid, options = {})
      request({
        :action  => "query",
        :prop    => %w{ revisions links images categories },
        :rvprop  => "content",
        :pageids => pageid
      }.merge(options))
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Clinks%7Cimages%7Ccategories&rvprop=content&titles=Flower%20(video%20game)
    def request_page(title, options = {})
      request({
        :action => "query",
        :prop   => %w{ revisions links images categories },
        :rvprop => "content",
        :titles => title
      }.merge(options))
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url&titles=File:Flower.png
    def request_image(title, options = {})
      request({
        :action => "query",
        :prop   => "imageinfo",
        :iiprop => "url",
        :titles => title
      }.merge(options))
    end

    # Performs the HTTP GET for the given query options.
    #
    # @return [String] the raw response body
    def request(options)
      # Loaded lazily, as in the original; open-uri adds #read to the parsed URI.
      require 'open-uri'
      URI.parse(url_for(options)).read("User-Agent" => "Ruby/#{RUBY_VERSION}")
    end

    protected

    # Runs the block against a Url wrapper around +value+ and returns its
    # result; if anything goes wrong (the value is not a parsable URL, is not
    # a String, ...) the value is returned unchanged.
    #
    # NOTE(review): a plain title that happens to parse as a URI and contains
    # "/" (e.g. "AC/DC") is reduced to its last segment here - this is the
    # original behaviour and is kept for compatibility; confirm it is intended.
    def from_url(value)
      yield Url.new(value)
    rescue StandardError
      value
    end

    # Follows redirect pages while follow_redirects is enabled, for at most
    # MAX_REDIRECTS hops. A redirect names its target by title, so every hop
    # is requested by title, regardless of how the first page was requested.
    def resolve_redirects(page, options)
      hops = 0
      while follow_redirects && page.redirect? && hops < MAX_REDIRECTS
        page = Page.new(request_page(page.redirect_title, options))
        hops += 1
      end
      page
    end

    def configuration_options
      {
        :domain => Configuration[:domain],
        :path   => Configuration[:path]
      }
    end

    # Builds the request URL: options matching a ":name" placeholder in
    # BASE_URL replace it, all other options are appended as query parameters.
    def url_for(options)
      url = BASE_URL.dup
      configuration_options.merge(options).each do |key, val|
        placeholder = ":#{key}"
        if url.include?(placeholder)
          value = STRUCTURAL_KEYS.include?(key.to_s) ? val.to_s : urlify_value(val).to_s
          # Block form: the replacement is inserted literally, without
          # backslash-sequence interpretation.
          url.sub!(placeholder) { value }
        else
          url << "&#{key}=#{urlify_value(val)}"
        end
      end
      url
    end

    # Arrays become a single "|"-separated value; everything is then encoded.
    def urlify_value(val)
      case val
      when Array
        encode(val.flatten.join('|'))
      else
        encode(val)
      end
    end

    # Percent-encodes a String for use as a query-string value; other values
    # are returned untouched.
    #
    # URI.encode (used previously) was removed in Ruby 3.0 and left "+"
    # unescaped, which a query-string decoder conventionally reads as a space.
    # encode_www_form_component escapes every reserved character and writes
    # spaces as "+"; those are rewritten to "%20" (any literal "+" is already
    # "%2B" at that point).
    def encode(val)
      case val
      when String
        URI.encode_www_form_component(val).gsub('+', '%20')
      else
        val
      end
    end
  end
end
require 'singleton' | |
module Wikipedia
  # Process-wide settings holder (a Singleton). Each directive is a single
  # method that reads its value when called without arguments and stores its
  # first argument otherwise.
  class Configuration
    include Singleton

    class << self
      # Declares one combined reader/writer method per directive name.
      def directives(*names)
        names.each do |name|
          ivar = "@#{name}"
          define_method(name) do |*values|
            values.empty? ? instance_variable_get(ivar) : instance_variable_set(ivar, values.first)
          end
        end
      end

      # Reads a directive's current value from the singleton instance.
      def [](name)
        instance.__send__(name)
      end
    end

    directives :domain, :path
  end
end
require 'hpricot' | |
module Wikipedia
  # Wraps one JSON response of the API's "query" action and exposes the first
  # page it contains: wikitext content, redirect target, categories, links,
  # images and image info.
  class Page
    # The raw JSON string this page was built from.
    attr_reader :json

    # @param json [String] raw JSON response body
    def initialize(json)
      require 'json'
      @json = json
      # NOTE(review): this parses data received over the network. JSON.load is
      # documented as meant for trusted input; consider JSON.parse if nil or
      # empty bodies never need to be accepted here.
      @data = JSON::load(json)
    end

    # @return [Hash, nil] the first entry of query.pages, or nil when the
    #   response carries no such hash (for example an error payload)
    def page
      query = @data['query'] if @data.is_a?(Hash)
      pages = query['pages'] if query.is_a?(Hash)
      pages.values.first if pages.respond_to?(:values)
    end

    # @return [String, nil] content of the first revision, or nil if absent
    def content
      revisions = page_field('revisions')
      revision = revisions && revisions.first
      return nil unless revision
      # The content is stored under "*". A revision may carry other keys
      # (e.g. contentformat) ahead of it, so taking the first value is only a
      # fallback for hashes without "*".
      revision.key?('*') ? revision['*'] : revision.values.first
    end

    # @return [String, nil] content with wiki markup reduced by Page.sanitize
    def sanitized_content
      self.class.sanitize(content)
    end

    # Heuristically collects external links that look like the subject's
    # website.
    #
    # NOTE(review): only "http://" links are recognised; "https://" links are
    # ignored by every pattern below - confirm whether that is still wanted.
    #
    # @return [Array<Array>, nil] triples of [url, label, trailing text] for
    #   bracketed links whose label or trailing text mentions a website, plus
    #   [url, raw match, nil] for bare URLs; nil when there is no content
    def external_website_uri
      text = content
      return nil if text.nil?

      official = /(web site|website|official (.*)site)/i
      clean = lambda { |url| url.chomp(']').chomp('}}') }

      links = text.scan(/\[(http:\/\/\S+)\s+([^\]]+)\]\s(.+)/).map { |x| [clean.call(x.first), x[1], x[2]] }
      links += text.scan(/\[(http:\/\/\S+)\s+([^\]]+)\]/).map { |x| [clean.call(x.first), x[1], nil] }

      site_links = links.select { |x| (x.last && x.last[official]) || x[1][official] }
      site_links += text.scan(/[^\[](http:\/\/\S+)/).map { |x| [clean.call(x.first.split('|').first), x.first, nil] }
      # Drop citation fragments and archived copies.
      site_links.delete_if { |x| x.first[/<\/ref>/] || x.first[/web.archive.org/] }
      site_links
    end

    # @return [String, nil] file name of the most likely logo: the first
    #   candidate containing "logo", else the first image candidate found
    def logo_image
      text = content
      return nil if text.nil?

      candidates = [
        text[/(logo|image|image_name)\s*=\s*\[\[(Image|File):(.+\.(png|svg|gif|jpg|jpeg))/, 3],
        text[/(logo|image|image_name)\s*=\s*(.+\.(png|svg|gif|jpg|jpeg))/, 2],
        text[/\[\[(Image|File):(.+\.(png|svg|gif|jpg|jpeg))/, 2]
      ].compact
      candidates.detect { |name| name[/logo/i] } || candidates.first
    end

    # @return the sanitized content parsed with Hpricot
    def doc
      Hpricot sanitized_content
    end

    # @return [MatchData, nil] truthy when the content is a "#REDIRECT [[...]]"
    #   page; capture 1 is the redirect target
    def redirect?
      text = content
      text && text.match(/\#REDIRECT\s+\[\[(.*?)\]\]/i)
    end

    # @return [String, nil] title of the redirect target
    def redirect_title
      match = redirect?
      match && match[1]
    end

    # Returns the same capture as #redirect_title: the redirect target's
    # title, not a numeric page id (the wikitext does not contain one).
    def redirect_pageid
      match = redirect?
      match && match[1]
    end

    # @return [String, nil]
    def title
      page_field('title')
    end

    # @return [Array<String>] category titles reported by the API ([] if none)
    def categories
      titles_of('categories') || []
    end

    # Categories scraped from "[[Category:...]]" links in the content, falling
    # back to #categories when there is no content or no such link.
    def alt_categories
      text = content
      return categories unless text

      cats = text.scan(/\[\[(Category:[^\]]+)\]\]/).map { |x| x.first.split('|').first }
      cats.empty? ? categories : cats
    end

    # @return [Array<String>, nil] titles of linked pages
    def links
      titles_of('links')
    end

    # @return [Array<String>, nil] titles of images used on the page
    def images
      titles_of('images')
    end

    def thumbnail_url
      image_info('thumburl')
    end

    def thumbnail_height
      image_info('thumbheight')
    end

    def thumbnail_width
      image_info('thumbwidth')
    end

    def image_url
      image_info('url')
    end

    def description_url
      image_info('descriptionurl')
    end

    # Looks up the URL of every jpg/jpeg/png/gif image on the page (skipping
    # titles containing "LinkFA-star"). Performs one Wikipedia.find_image call
    # per image.
    #
    # @return [Array, nil] nil when the page lists no images
    def image_urls
      list = images
      return nil unless list

      filtered = list.select { |i| i =~ /^file:.+\.(jpg|jpeg|png|gif)$/i && !i.include?("LinkFA-star") }
      filtered.map { |title| Wikipedia.find_image(title).image_url }
    end

    # @return the parsed JSON structure
    def raw_data
      @data
    end

    # Reduces wikitext to light HTML: removes templates, the leading table,
    # file/image links, references and comments; unwraps internal links;
    # converts bold/italic quotes; wraps blank-line separated sections in <p>.
    #
    # @param s [String, nil] wikitext (not modified)
    # @return [String, nil] nil when +s+ is nil
    def self.sanitize(s)
      return nil unless s

      text = s.dup
      file_link  = /\[\[File:[^\[\]]+?\]\]/
      image_link = /\[\[Image:[^\[\]]+?\]\]/

      # strip anything inside curly braces! Repeated so nested templates are
      # removed from the inside out.
      template = /\{\{[^\{\}]+?\}\}/
      text.gsub!(template, '') while text =~ template

      # strip info box
      text.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

      # strip images and file links that contain no nested link. This has to
      # happen before internal links are unwrapped, otherwise "[[File:x.png]]"
      # is treated as an internal link and leaks into the text.
      text.gsub!(image_link, '')
      text.gsub!(file_link, '')

      # strip internal links
      text.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
      text.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')

      # strip images and file links whose caption contained internal links
      # (only matchable now that the inner links are unwrapped)
      text.gsub!(image_link, '')
      text.gsub!(file_link, '')

      # convert bold/italic to html
      text.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
      text.gsub!(/'''(.+?)'''/, '<b>\1</b>')
      text.gsub!(/''(.+?)''/, '<i>\1</i>')

      # misc
      text.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
      text.gsub!(/<!--[^>]+?-->/, '')
      # NOTE(review): the pattern reached this review as a single space
      # replaced by a single space (a no-op), presumably whitespace-collapsed
      # in transit; collapsing double spaces is assumed to be the intent.
      text.gsub!('  ', ' ')
      text.strip!

      # create paragraphs
      sections = text.split("\n\n")
      return text unless sections.size > 1

      sections.map { |section| "<p>#{section.strip}</p>" }.join("\n")
    end

    private

    # Value stored under +key+ in the page hash, or nil when there is no page.
    def page_field(key)
      current = page
      current[key] if current
    end

    # Titles of the entries listed under +key+, or nil when the key is absent.
    def titles_of(key)
      entries = page_field(key)
      entries.map { |entry| entry['title'] } if entries
    end

    # Field of the first imageinfo entry, or nil when there is none.
    def image_info(key)
      infos = page_field('imageinfo')
      first = infos && infos.first
      first[key] if first
    end
  end
end
module Wikipedia
  # Extracts the page title (or page id) from a wiki page URL: the last
  # segment of the URL's path, percent-decoded.
  class Url
    # @param wiki_url [String] e.g. "http://en.wikipedia.org/wiki/Rails"
    def initialize(wiki_url)
      @wiki_url = wiki_url
    end

    # @return [String] decoded last path segment (memoized)
    # @raise [URI::InvalidURIError] when the URL cannot be parsed
    # @raise [ArgumentError] when the URL has no path segment
    def title
      @title ||= last_path_segment
    end

    # @return [String] decoded last path segment (memoized); note that the id
    #   is returned as a String, exactly as it appears in the URL
    # @raise [URI::InvalidURIError] when the URL cannot be parsed
    # @raise [ArgumentError] when the URL has no path segment
    def pageid
      @pageid ||= last_path_segment
    end

    private

    # Parses the URL and returns its last path segment, percent-decoded.
    def last_path_segment
      segment = URI.parse(@wiki_url).path.split('/').last
      raise ArgumentError, "no path segment in #{@wiki_url.inspect}" if segment.nil?

      unescape(segment)
    end

    # Decodes %XX escapes. URI.decode, used previously, was removed in Ruby
    # 3.0; URI.decode_www_form_component is not a drop-in replacement because
    # it turns "+" into a space, which is wrong inside a path ("C++").
    # Whole runs of escapes are decoded at once so multi-byte characters are
    # rebuilt intact.
    def unescape(segment)
      segment.gsub(/(?:%[0-9a-fA-F]{2})+/) do |run|
        [run.delete('%')].pack('H*').force_encoding(segment.encoding)
      end
    end
  end
end
require 'uri' | |
module Wikipedia
  class << self
    # Examples :
    # page = Wikipedia.find('Rails')
    # => #<Wikipedia:0x123102>
    # page.content
    # => wiki content appears here
    def find(page, options = {})
      client.find page, options
    end

    # Same as .find, but looks the page up by page id.
    def find_pageid(pageid, options = {})
      client.find_pageid pageid, options
    end

    # Fetches image information for a "File:..." title.
    def find_image(title, options = {})
      client.find_image title, options
    end

    # Resolves the logo image of +page+ to a thumbnail.
    #
    # @param page   a page responding to #logo_image (may be nil)
    # @param height requested thumbnail height (sent as iiurlheight)
    # @param width  requested thumbnail width (sent as iiurlwidth);
    #   defaults to '210', the value that used to be hard-coded
    # @return [Array] [url, height, width, description_url]; when only the
    #   full-size image URL is known the last three are nil, and all four are
    #   nil when the page has no logo image or nothing was found
    def find_page_image(page, height, width = '210')
      nothing = [nil, nil, nil, nil]
      file_name = page && page.logo_image
      return nothing unless file_name

      image = find_image("File:#{file_name.gsub(' ', '_')}", :iiurlheight => height, :iiurlwidth => width)
      if image.thumbnail_url
        [image.thumbnail_url, image.thumbnail_height, image.thumbnail_width, image.description_url]
      elsif image.image_url
        [image.image_url, nil, nil, nil]
      else
        nothing
      end
    end

    # Evaluates the block against the Configuration singleton, so directives
    # can be written as bare calls (domain '...', path '...').
    def Configure(&block)
      Configuration.instance.instance_eval(&block)
    end

    # The shared Client instance, created on first use.
    #
    # This method has always been publicly callable: the bare `private` that
    # used to precede `def self.client` does not apply to singleton methods.
    # It is kept public deliberately, since it is the way to reach client
    # settings such as follow_redirects.
    def client
      @client ||= Wikipedia::Client.new
    end
  end

  # Default configuration: the English Wikipedia API endpoint.
  Configure {
    domain 'en.wikipedia.org'
    path 'w/api.php'
  }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment