Skip to content

Instantly share code, notes, and snippets.

@robmckinnon
Created March 25, 2011 22:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save robmckinnon/887730 to your computer and use it in GitHub Desktop.
module Wikipedia
# Thin HTTP client for the MediaWiki web API
# (see http://en.wikipedia.org/w/api.php). Builds request URLs from
# BASE_URL and returns raw JSON strings for Page to parse.
class Client
# URL template: :domain and :path come from Configuration, :action from
# the request options; remaining options are appended as query
# parameters by #url_for.
BASE_URL = "http://:domain/:path?action=:action&format=json"
# When true (the default), #find and #find_pageid keep requesting until
# they land on a non-redirect page.
attr_accessor :follow_redirects
def initialize
self.follow_redirects = true
end
# Look up a page by title. +title+ may also be a full wiki URL, in which
# case the title is extracted from it (parse failures fall back to the
# raw value). Returns a Page.
def find title, options = {}
title = Url.new(title).title rescue title
page = Page.new request_page(title, options)
# `&&`, not `and`: `and` binds looser than `=` and invites precedence bugs.
while follow_redirects && page.redirect?
page = Page.new request_page(page.redirect_title, options)
end
page
end
# Look up a page by numeric pageid (or a URL whose last segment is one).
def find_pageid pageid, options = {}
pageid = Url.new(pageid).pageid rescue pageid
page = Page.new request_pageid(pageid, options)
while follow_redirects && page.redirect?
page = Page.new request_pageid(page.redirect_pageid, options)
end
page
end
# Fetch image metadata (imageinfo) for a "File:" title. Redirects are
# not followed here.
def find_image title, options = {}
title = Url.new(title).title rescue title
Page.new request_image( title, options )
end
# http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Cimages&rvprop=content&pageids=435509
def request_pageid pageid, options = {}
request( {
:action => "query",
:prop => %w{ revisions links images categories },
:rvprop => "content",
:pageids => pageid
}.merge( options ) )
end
# http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Clinks%7Cimages%7Ccategories&rvprop=content&titles=Flower%20(video%20game)
def request_page( title, options = {} )
request( {
:action => "query",
:prop => %w{ revisions links images categories },
:rvprop => "content",
:titles => title
}.merge( options ) )
end
# http://en.wikipedia.org/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url&titles=File:Flower.png
def request_image( title, options = {} )
request( {
:action => "query",
:prop => "imageinfo",
:iiprop => "url",
:titles => title
}.merge( options ) )
end
# Perform the HTTP GET and return the response body as a string.
def request( options )
require 'open-uri' # deliberate lazy load: only needed once a request is made
URI.parse( url_for( options ) ).read( "User-Agent" => "Ruby/#{RUBY_VERSION}" )
end
protected
def configuration_options
{
:domain => Configuration[:domain],
:path => Configuration[:path]
}
end
# Substitute each :key placeholder in BASE_URL; options with no
# placeholder are appended as extra &key=value query parameters.
def url_for( options )
url = BASE_URL.dup
options = configuration_options.merge( options )
options.each do |key, val|
value = urlify_value( val )
if url.include?( ":#{key}" )
url.sub! ":#{key}", value
else
url << "&#{key}=#{value}"
end
end
url
end
# Arrays become pipe-separated lists, as the MediaWiki API expects.
def urlify_value( val )
case val
when Array
encode( val.flatten.join( '|' ) )
else
encode( val )
end
end
# Percent-encode a string value; non-strings pass through untouched.
# URI::DEFAULT_PARSER.escape is the drop-in replacement for URI.encode,
# which was removed in Ruby 3.0. '&' is escaped separately because the
# default unsafe set treats it as reserved, but here it would split the
# query string.
def encode( val )
case val
when String
URI::DEFAULT_PARSER.escape( val ).gsub( '&', '%26' )
else
val
end
end
end
end
require 'singleton'
module Wikipedia
# Singleton store for client settings (wiki domain and API path).
class Configuration
include Singleton
# Declare one accessor method per directive name. Calling the method
# with no arguments reads the stored value; calling it with an
# argument writes it (DSL style, used by Wikipedia.Configure).
def self.directives(*names)
names.each do |name|
ivar = "@#{name}"
define_method(name) do |*args|
args.empty? ? instance_variable_get(ivar) : instance_variable_set(ivar, args.first)
end
end
end
# Shorthand read access: Configuration[:domain]
def self.[](directive)
instance.send(directive)
end
directives :domain, :path
end
end
require 'hpricot'
module Wikipedia
# Wraps one MediaWiki API JSON response and exposes accessors over the
# first entry of the response's query.pages hash.
class Page
# json: the raw JSON string returned by Client#request.
def initialize(json)
require 'json'
@json = json
@data = JSON::load(json)
end
# First page hash under query.pages, or nil when the structure is absent.
def page
pages = @data['query']['pages']
if pages.respond_to?(:values)
pages.values.first
else
nil
end
end
# Wikitext of the first revision's first value, or nil when missing.
def content
if page && page['revisions']
page['revisions'].first.values.first
else
nil
end
end
# Content with wiki markup stripped via Page.sanitize (nil stays nil).
def sanitized_content
self.class.sanitize(content)
end
# Heuristic scan of the wikitext for external http:// links that look
# like an official website; returns [url, label, trailing_text] triples
# (trailing_text may be nil), or nil when there is no content.
def external_website_uri
if content.nil?
nil
else
links = content.scan(/\[(http:\/\/\S+)\s+([^\]]+)\]\s(.+)/).map {|x| [x.first.chomp(']').chomp('}}'), x[1], x[2]] }
more_links = content.scan(/\[(http:\/\/\S+)\s+([^\]]+)\]/).map {|x| [x.first.chomp(']').chomp('}}'), x[1], nil] }
links += more_links
# keep only links whose label or trailing text mentions a web site
site_links = links.select {|x| (x.last && x.last[/(web site|website|official (.*)site)/i]) || x[1][/(web site|website|official (.*)site)/i] }
# bare (unbracketed) URLs are appended as additional candidates
site_links += content.scan(/[^\[](http:\/\/\S+)/).map {|x| [x.first.split('|').first.chomp(']').chomp('}}'), x.first, nil]}
# drop citation remnants and archive.org mirrors
site_links.delete_if {|x| x.first[/<\/ref>/] || x.first[/web.archive.org/] }
site_links
end
end
# Best-guess image filename for the page's logo: tries explicit
# logo/image infobox fields first, then the first embedded image.
# Among matches, a filename containing "logo" wins; nil when nothing
# matches or there is no content.
def logo_image
if content.nil?
nil
else
images = [
content[/(logo|image|image_name)\s*=\s*\[\[(Image|File):(.+\.(png|svg|gif|jpg|jpeg))/,3],
content[/(logo|image|image_name)\s*=\s*(.+\.(png|svg|gif|jpg|jpeg))/,2],
content[/\[\[(Image|File):(.+\.(png|svg|gif|jpg|jpeg))/,2]
].compact
if images.empty?
nil
else
images.detect {|x| x[/logo/i]} || images.first
end
end
end
# Sanitized content parsed with the (legacy, third-party) Hpricot parser.
def doc
Hpricot sanitized_content
end
# MatchData when the content is a #REDIRECT page (capture 1 = target),
# otherwise nil (no content) or falsy (no match).
def redirect?
content && content.match(/\#REDIRECT\s+\[\[(.*?)\]\]/i)
end
# Title of the redirect target, or nil when not a redirect.
def redirect_title
if matches = redirect?
matches[1]
end
end
# NOTE(review): identical to #redirect_title — the capture is a title,
# not a numeric pageid; confirm Client#find_pageid tolerates receiving
# a title here.
def redirect_pageid
if matches = redirect?
matches[1]
end
end
# Page title. Raises NoMethodError when #page is nil (missing page).
def title
page['title']
end
# Category titles from the API response, [] when none were returned.
def categories
if page['categories']
page['categories'].map {|c| c['title'] }
else
[]
end
end
# Categories scraped from the wikitext ([[Category:...]] links),
# falling back to #categories when none appear in the text.
def alt_categories
if content
cats = content.scan(/\[\[(Category:[^\]]+)\]\]/).map{|x| x.first.split('|').first }
if cats.empty?
categories
else
cats
end
else
categories
end
end
# Titles of linked pages, or nil when the response carried no links.
def links
page['links'].map {|c| c['title'] } if page['links']
end
# Titles of embedded images, or nil when the response carried none.
def images
page['images'].map {|c| c['title'] } if page['images']
end
# The next five read the first imageinfo entry (populated by
# Client#request_image); each returns nil when imageinfo is absent.
def thumbnail_url
page['imageinfo'].first['thumburl'] if page['imageinfo']
end
def thumbnail_height
page['imageinfo'].first['thumbheight'] if page['imageinfo']
end
def thumbnail_width
page['imageinfo'].first['thumbwidth'] if page['imageinfo']
end
def image_url
page['imageinfo'].first['url'] if page['imageinfo']
end
def description_url
page['imageinfo'].first['descriptionurl'] if page['imageinfo']
end
# Resolve each raster "File:" image title to its direct URL, issuing
# one extra API call per image; skips the LinkFA-star decoration.
def image_urls
if list = images
filtered = list.select {|i| i =~ /^file:.+\.(jpg|jpeg|png|gif)$/i && !i.include?("LinkFA-star") }
filtered.map do |title|
Wikipedia.find_image( title ).image_url
end
end
end
# Parsed response hash, as loaded by JSON::load.
def raw_data
@data
end
# Original JSON string passed to the constructor.
def json
@json
end
# Convert wikitext to simple HTML-ish text: strips templates, infoboxes,
# internal links, images and <ref> tags, and maps ''italic''/'''bold'''
# markup to <i>/<b>. Returns nil when s is nil; otherwise works on a dup.
def self.sanitize( s )
if s
s = s.dup
# strip anything inside curly braces!
while s =~ /\{\{[^\{\}]+?\}\}/
s.gsub!(/\{\{[^\{\}]+?\}\}/, '')
end
# strip info box
s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')
# strip internal links
s.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
s.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')
# strip images and file links
s.gsub!(/\[\[Image:[^\[\]]+?\]\]/, '')
s.gsub!(/\[\[File:[^\[\]]+?\]\]/, '')
# convert bold/italic to html
s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
s.gsub!(/'''(.+?)'''/, '<b>\1</b>')
s.gsub!(/''(.+?)''/, '<i>\1</i>')
# misc
s.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
s.gsub!(/<!--[^>]+?-->/, '')
# NOTE(review): the first argument here looks scrape-mangled — it was
# likely "&nbsp;" or a literal non-breaking space; verify against the
# original gist before relying on this substitution.
s.gsub!(' ', ' ')
s.strip!
# create paragraphs
sections = s.split("\n\n")
if sections.size > 1
s = sections.map {|s| "<p>#{s.strip}</p>" }.join("\n")
end
s
end
end
end
end
module Wikipedia
# Parses a full wiki URL to extract its last path segment, which the
# Client uses as a page title or pageid lookup key.
class Url
def initialize(wiki_url)
@wiki_url = wiki_url
end
# Percent-decoded last path segment, memoized. Raises
# URI::InvalidURIError on unparseable input; Client callers rescue and
# fall back to the raw value.
def title
@title ||= decoded_last_segment
end
# Same extraction as #title: pageid-style URLs carry the id as the
# last path segment.
def pageid
@pageid ||= decoded_last_segment
end
private
# URI::DEFAULT_PARSER.unescape is the exact drop-in for URI.decode,
# which was removed in Ruby 3.0 (CGI.unescape would also turn '+' into
# a space, changing behavior).
def decoded_last_segment
uri = URI.parse( @wiki_url )
URI::DEFAULT_PARSER.unescape( uri.path.split('/').last )
end
end
end
require 'uri'
module Wikipedia
class << self
# Convenience facade: delegates lookups to a lazily-built shared Client.
#
# Examples :
# page = Wikipedia.find('Rails')
# => #<Wikipedia:0x123102>
# page.content
# => wiki content appears here
def find page, options = {}
client.find page, options
end
# Look up a page by numeric pageid.
def find_pageid pageid, options = {}
client.find_pageid pageid, options
end
# Fetch imageinfo metadata for a "File:" title.
def find_image title, options = {}
client.find_image title, options
end
# Resolve the page's logo image to a renderable URL. Returns
# [url, height, width, description_url]; thumbnail data when a thumb
# is available, the full image URL otherwise, all nils when nothing
# usable is found.
def find_page_image page, height
blank = [nil, nil, nil, nil]
return blank unless page && page.logo_image
image = find_image("File:#{page.logo_image.gsub(' ', '_')}", :iiurlheight => height, :iiurlwidth => '210')
if image.thumbnail_url
[image.thumbnail_url, image.thumbnail_height, image.thumbnail_width, image.description_url]
elsif image.image_url
[image.image_url, nil, nil, nil]
else
blank
end
end
# DSL entry point: evaluates the block against the Configuration
# singleton (see Configuration.directives).
def Configure &block
Configuration.instance.instance_eval(&block)
end
end
# Default configuration, applied at load time.
Configure {
domain 'en.wikipedia.org'
path 'w/api.php'
}
private
# Memoized shared client instance.
def self.client
@client ||= Wikipedia::Client.new
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment