robmckinnon/gist:887730

## gistfile1.txt
module Wikipedia
  # Small HTTP client for the MediaWiki query API.
  # see http://en.wikipedia.org/w/api.php
  class Client
    # URL template; ":domain", ":path" and ":action" are substituted by url_for,
    # every other option is appended as a query parameter.
    BASE_URL = "http://:domain/:path?action=:action&format=json"

    # Upper bound on redirect hops, so that a page redirecting to itself
    # (or a redirect cycle) cannot keep the client requesting forever.
    MAX_REDIRECTS = 10

    # Inputs treated as page URLs rather than titles: anything starting with
    # "scheme://" or containing a "/wiki/" path segment. Plain titles that
    # merely contain a slash (e.g. "AC/DC") must not be cut at the slash.
    URL_LIKE = %r{\A[a-z][a-z0-9+.\-]*://|/wiki/}i

    # When true (the default) find/find_pageid follow "#REDIRECT" pages.
    attr_accessor :follow_redirects

    def initialize
      self.follow_redirects = true
    end

    # Fetches a page by title (or by page URL) and returns a Page.
    # +options+ are merged into the API query parameters.
    def find(title, options = {})
      page = Page.new request_page(title_from(title), options)
      resolve_redirects(page, options)
    end

    # Fetches a page by numeric page id and returns a Page.
    # Redirects are followed by *title*: the redirect target parsed out of the
    # wikitext is a page title, so it cannot be sent as a "pageids" value.
    def find_pageid(pageid, options = {})
      page = Page.new request_pageid(pageid_from(pageid), options)
      resolve_redirects(page, options)
    end

    # Fetches image info (prop=imageinfo) for a "File:..." title.
    def find_image(title, options = {})
      Page.new request_image(title_from(title), options)
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Cimages&rvprop=content&pageids=435509
    def request_pageid(pageid, options = {})
      request(page_query.merge(:pageids => pageid).merge(options))
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Clinks%7Cimages%7Ccategories&rvprop=content&titles=Flower%20(video%20game)
    def request_page(title, options = {})
      request(page_query.merge(:titles => title).merge(options))
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url&titles=File:Flower.png
    def request_image(title, options = {})
      request({
        :action => "query",
        :prop   => "imageinfo",
        :iiprop => "url",
        :titles => title
      }.merge(options))
    end

    # Performs the HTTP GET and returns the raw response body.
    def request(options)
      require 'open-uri' # loaded lazily, only when a request is actually made
      URI.parse(url_for(options)).read("User-Agent" => "Ruby/#{RUBY_VERSION}")
    end

    protected

      # Query parameters shared by title and pageid lookups.
      def page_query
        {
          :action => "query",
          :prop   => %w{ revisions links images categories },
          :rvprop => "content"
        }
      end

      # Requests the redirect target while the page is a redirect, at most
      # MAX_REDIRECTS times; returns the last page fetched.
      def resolve_redirects(page, options)
        hops = 0
        while follow_redirects && page.redirect? && hops < MAX_REDIRECTS
          page = Page.new request_page(page.redirect_title, options)
          hops += 1
        end
        page
      end

      # Returns the page title for +title+. URL-looking input is reduced to
      # its last path segment via Url; anything else is returned untouched.
      def title_from(title)
        return title unless title.is_a?(String) && title =~ URL_LIKE
        Url.new(title).title || title
      rescue StandardError
        # Best effort: input that cannot be parsed as a URL is used as given.
        title
      end

      # Returns the page id for +pageid+, accepting a URL whose last path
      # segment is the id. Unparseable input is used as given.
      def pageid_from(pageid)
        Url.new(pageid).pageid || pageid
      rescue StandardError
        pageid
      end

      def configuration_options
        {
          :domain => Configuration[:domain],
          :path   => Configuration[:path]
        }
      end

      # Builds the request URL from BASE_URL, the configured domain/path and
      # +options+.
      def url_for(options)
        url = BASE_URL.dup
        configuration_options.merge(options).each do |key, val|
          value = urlify_value(val).to_s
          placeholder = ":#{key}"
          if url.include?(placeholder)
            # Block form: the value is inserted literally, without
            # backreference interpretation of backslashes.
            url.sub!(placeholder) { value }
          else
            url << "&#{key}=#{value}"
          end
        end
        url
      end

      # Arrays become a single '|'-separated, encoded value.
      def urlify_value(val)
        case val
        when Array
          encode(val.flatten.join('|'))
        else
          encode(val)
        end
      end

      # Percent-encodes String values; other values pass through unchanged.
      def encode(val)
        return val unless val.is_a?(String)
        # URI.encode was removed in Ruby 3.0; the default parser's escape is
        # the same operation. '&' and '+' are left alone by it, but inside a
        # query value they must be escaped ('+' would be decoded as a space).
        URI::DEFAULT_PARSER.escape(val).gsub('&', '%26').gsub('+', '%2B')
      end
  end
end

require 'singleton'

module Wikipedia
  # Process-wide settings holder (a Singleton).
  #
  # Each directive is a combined reader/writer: called with no arguments it
  # returns the stored value, called with an argument it stores that argument.
  class Configuration
    include Singleton

    # Defines one reader/writer instance method per name in +directives+,
    # backed by the instance variable of the same name.
    def self.directives(*directives)
      directives.each do |name|
        ivar = "@#{name}"
        define_method(name) do |*args|
          # `next` makes the block's value the method's return value.
          next instance_variable_get(ivar) if args.empty?
          instance_variable_set(ivar, args.first)
        end
      end
    end

    # Reads a directive from the singleton, e.g. Configuration[:domain].
    def self.[](directive)
      instance.__send__(directive)
    end

    directives :domain, :path
  end
end

require 'hpricot'

module Wikipedia
  # Wraps the JSON body returned by an "action=query" API request and exposes
  # the first page of the result (wikitext, links, images, image info, ...).
  class Page
    # Labels that mark an external link as the subject's own web site.
    WEBSITE_LABEL = /(web site|website|official (.*)site)/i

    # json - the raw JSON response body (String); nil/blank yields no data.
    def initialize(json)
      require 'json'
      @json = json
      # JSON.parse rather than JSON.load: the body comes from the network and
      # JSON.load is documented as intended for trusted input only.
      @data = (json.nil? || json.to_s.strip.empty?) ? nil : JSON.parse(json)
    end

    # The first entry of query.pages, or nil when the response carries no
    # pages (for example an API error response).
    def page
      return nil unless @data.is_a?(Hash)
      query = @data['query']
      return nil unless query.is_a?(Hash)
      pages = query['pages']
      pages.respond_to?(:values) ? pages.values.first : nil
    end

    # Wikitext of the first revision, or nil when there is none.
    def content
      revisions = page_value('revisions')
      revision = revisions && revisions.first
      return nil unless revision
      # The text is stored under '*'. A revision hash may list other keys
      # (contentformat, contentmodel) first, so taking the first value is
      # only a fallback for hashes without a '*' key.
      revision.key?('*') ? revision['*'] : revision.values.first
    end

    def sanitized_content
      self.class.sanitize(content)
    end

    # Returns an array of [url, label, trailing_text] triples for external
    # links that look like the subject's web site, followed by every bare
    # http(s) URL found in the text; nil when the page has no content.
    # Links containing "</ref>" or pointing at web.archive.org are dropped.
    def external_website_uri
      text = content
      return nil if text.nil?

      # "[url label] trailing text" and plain "[url label]" forms.
      labelled = text.scan(/\[(https?:\/\/\S+)\s+([^\]]+)\]\s(.+)/).map { |x| [x.first.chomp(']').chomp('}}'), x[1], x[2]] }
      labelled += text.scan(/\[(https?:\/\/\S+)\s+([^\]]+)\]/).map { |x| [x.first.chomp(']').chomp('}}'), x[1], nil] }

      site_links = labelled.select { |x| (x.last && x.last[WEBSITE_LABEL]) || x[1][WEBSITE_LABEL] }

      # URLs not directly preceded by '[' (e.g. template parameters).
      site_links += text.scan(/[^\[](https?:\/\/\S+)/).map { |x| [x.first.split('|').first.chomp(']').chomp('}}'), x.first, nil] }

      site_links.delete_if { |x| x.first[/<\/ref>/] || x.first[/web.archive.org/] }
      site_links
    end

    # File name of the page's logo/lead image taken from the wikitext, or nil.
    # A candidate whose name contains "logo" wins over the others.
    def logo_image
      text = content
      return nil if text.nil?

      candidates = [
        text[/(logo|image|image_name)\s*=\s*\[\[(Image|File):(.+\.(png|svg|gif|jpg|jpeg))/, 3],
        text[/(logo|image|image_name)\s*=\s*(.+\.(png|svg|gif|jpg|jpeg))/, 2],
        text[/\[\[(Image|File):(.+\.(png|svg|gif|jpg|jpeg))/, 2]
      ].compact
      candidates.detect { |x| x[/logo/i] } || candidates.first
    end

    # Sanitized content parsed with Hpricot.
    def doc
      Hpricot(sanitized_content)
    end

    # Returns the MatchData of the "#REDIRECT [[...]]" marker (truthy) when
    # the page is a redirect, nil/false otherwise.
    def redirect?
      content && content.match(/\#REDIRECT\s+\[\[(.*?)\]\]/i)
    end

    # Title of the redirect target, or nil.
    def redirect_title
      matches = redirect?
      matches && matches[1]
    end

    # NOTE(review): this returns the redirect target exactly as
    # redirect_title does, i.e. the text inside [[...]], not a numeric id.
    def redirect_pageid
      redirect_title
    end

    def title
      page_value('title')
    end

    # Category titles from the API response; [] when absent.
    def categories
      titles_of('categories') || []
    end

    # Category names parsed from the wikitext (sort keys after '|' removed),
    # falling back to #categories when the text names none.
    def alt_categories
      text = content
      return categories unless text

      cats = text.scan(/\[\[(Category:[^\]]+)\]\]/).map { |x| x.first.split('|').first }
      cats.empty? ? categories : cats
    end

    # Titles of linked pages, or nil when the response has no links.
    def links
      titles_of('links')
    end

    # Titles of images used on the page, or nil when the response has none.
    def images
      titles_of('images')
    end

    def thumbnail_url
      image_info('thumburl')
    end

    def thumbnail_height
      image_info('thumbheight')
    end

    def thumbnail_width
      image_info('thumbwidth')
    end

    def image_url
      image_info('url')
    end

    def description_url
      image_info('descriptionurl')
    end

    # URLs of the page's jpg/jpeg/png/gif images (one find_image request per
    # image); nil when the response lists no images.
    def image_urls
      list = images
      return nil unless list

      wanted = list.select { |i| i =~ /^file:.+\.(jpg|jpeg|png|gif)$/i && !i.include?("LinkFA-star") }
      wanted.map { |image_title| Wikipedia.find_image(image_title).image_url }
    end

    def raw_data
      @data
    end

    def json
      @json
    end

    # Converts wikitext to rough HTML: templates, tables, refs, comments and
    # images are removed, internal links are reduced to their text, bold and
    # italic quotes become <b>/<i>, and blank-line separated sections become
    # <p> paragraphs. Returns nil for nil input.
    def self.sanitize( s )
      return nil unless s
      s = s.dup

      # strip anything inside curly braces, innermost first
      template = /\{\{[^\{\}]+?\}\}/
      s.gsub!(template, '') while s =~ template

      # strip info box
      s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

      # strip internal links; Image:/File: links are skipped here so they are
      # removed entirely below instead of being turned into plain text
      s.gsub!(/\[\[(?!(?:Image|File):)([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
      s.gsub!(/\[\[(?!(?:Image|File):)([^\]\|]+?)\]\]/, '\1')

      # strip images and file links
      s.gsub!(/\[\[Image:[^\[\]]+?\]\]/, '')
      s.gsub!(/\[\[File:[^\[\]]+?\]\]/, '')

      # convert bold/italic to html
      s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
      s.gsub!(/'''(.+?)'''/, '<b>\1</b>')
      s.gsub!(/''(.+?)''/, '<i>\1</i>')

      # misc
      s.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
      s.gsub!(/<!--[^>]+?-->/, '')
      s.gsub!('  ', ' ')
      s.strip!

      # create paragraphs
      sections = s.split("\n\n")
      if sections.size > 1
        s = sections.map { |section| "<p>#{section.strip}</p>" }.join("\n")
      end

      s
    end

    private

    # Value stored under +key+ in the page hash, or nil when there is no page.
    def page_value(key)
      current = page
      current && current[key]
    end

    # 'title' of every entry in the list stored under +key+, or nil.
    def titles_of(key)
      entries = page_value(key)
      entries && entries.map { |entry| entry['title'] }
    end

    # Field of the first imageinfo entry, or nil when there is none.
    def image_info(field)
      infos = page_value('imageinfo')
      info = infos && infos.first
      info && info[field]
    end
  end
end

module Wikipedia
  # Extracts the page identifier (title or page id) from a page URL: the
  # percent-decoded last segment of the URL's path.
  class Url
    def initialize(wiki_url)
      @wiki_url = wiki_url
    end

    # Last path segment of the URL, decoded. Raises (URI::InvalidURIError or
    # ArgumentError) when the input is not a URL with a usable path segment.
    def title
      @title ||= last_path_segment
    end

    # Same extraction as #title; kept as a separate reader for pageid URLs.
    def pageid
      @pageid ||= last_path_segment
    end

    private

    def last_path_segment
      path = URI.parse(@wiki_url).path
      segment = path && path.split('/').last
      raise ArgumentError, "no page segment in #{@wiki_url.inspect}" if segment.nil?
      # URI.decode was removed in Ruby 3.0; the default parser's unescape
      # performs the same percent-decoding.
      URI::DEFAULT_PARSER.unescape(segment)
    end
  end
end

require 'uri'

module Wikipedia

  class << self

    # Examples :
    # page = Wikipedia.find('Rails')
    # => #<Wikipedia:0x123102>
    # page.content
    # => wiki content appears here
    def find(page, options = {})
      client.find page, options
    end

    # Looks a page up by page id; see Client#find_pageid.
    def find_pageid(pageid, options = {})
      client.find_pageid pageid, options
    end

    # Fetches image info for a "File:..." title; see Client#find_image.
    def find_image(title, options = {})
      client.find_image title, options
    end

    # Looks up the image named by page.logo_image and returns
    # [url, height, width, description_url].
    #
    # When a thumbnail is available its url and dimensions are returned;
    # otherwise the full image url with nil for the other three entries.
    # All four entries are nil when the page has no logo image or no url
    # could be found.
    #
    # height - requested thumbnail height (sent as iiurlheight)
    # width  - requested thumbnail width (sent as iiurlwidth), default '210'
    def find_page_image(page, height, width = '210')
      nothing = [nil, nil, nil, nil]
      logo = page && page.logo_image
      return nothing unless logo

      image = find_image("File:#{logo.gsub(' ','_')}", :iiurlheight => height, :iiurlwidth => width)
      if image.thumbnail_url
        [image.thumbnail_url, image.thumbnail_height, image.thumbnail_width, image.description_url]
      elsif image.image_url
        [image.image_url, nil, nil, nil]
      else
        nothing
      end
    end

    # Evaluates the block against the Configuration singleton, so directives
    # can be written as bare calls (domain '...', path '...').
    def Configure(&block)
      Configuration.instance.instance_eval(&block)
    end

    # The shared Client, created on first use. This stays public: a bare
    # `private` has no effect on `def self.` methods, so it always was
    # callable as Wikipedia.client.
    def client
      @client ||= Wikipedia::Client.new
    end
  end

  Configure {
    domain 'en.wikipedia.org'
    path   'w/api.php'
  }
end
	module Wikipedia
	  # Small HTTP client for the MediaWiki query API.
	  # see http://en.wikipedia.org/w/api.php
	  class Client
	    # URL template; ":domain", ":path" and ":action" are substituted by url_for,
	    # every other option is appended as a query parameter.
	    BASE_URL = "http://:domain/:path?action=:action&format=json"

	    # Upper bound on redirect hops, so that a page redirecting to itself
	    # (or a redirect cycle) cannot keep the client requesting forever.
	    MAX_REDIRECTS = 10

	    # Inputs treated as page URLs rather than titles: anything starting with
	    # "scheme://" or containing a "/wiki/" path segment. Plain titles that
	    # merely contain a slash (e.g. "AC/DC") must not be cut at the slash.
	    URL_LIKE = %r{\A[a-z][a-z0-9+.\-]*://|/wiki/}i

	    # When true (the default) find/find_pageid follow "#REDIRECT" pages.
	    attr_accessor :follow_redirects

	    def initialize
	      self.follow_redirects = true
	    end

	    # Fetches a page by title (or by page URL) and returns a Page.
	    # +options+ are merged into the API query parameters.
	    def find(title, options = {})
	      page = Page.new request_page(title_from(title), options)
	      resolve_redirects(page, options)
	    end

	    # Fetches a page by numeric page id and returns a Page.
	    # Redirects are followed by *title*: the redirect target parsed out of the
	    # wikitext is a page title, so it cannot be sent as a "pageids" value.
	    def find_pageid(pageid, options = {})
	      page = Page.new request_pageid(pageid_from(pageid), options)
	      resolve_redirects(page, options)
	    end

	    # Fetches image info (prop=imageinfo) for a "File:..." title.
	    def find_image(title, options = {})
	      Page.new request_image(title_from(title), options)
	    end

	    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Cimages&rvprop=content&pageids=435509
	    def request_pageid(pageid, options = {})
	      request(page_query.merge(:pageids => pageid).merge(options))
	    end

	    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Clinks%7Cimages%7Ccategories&rvprop=content&titles=Flower%20(video%20game)
	    def request_page(title, options = {})
	      request(page_query.merge(:titles => title).merge(options))
	    end

	    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url&titles=File:Flower.png
	    def request_image(title, options = {})
	      request({
	        :action => "query",
	        :prop   => "imageinfo",
	        :iiprop => "url",
	        :titles => title
	      }.merge(options))
	    end

	    # Performs the HTTP GET and returns the raw response body.
	    def request(options)
	      require 'open-uri' # loaded lazily, only when a request is actually made
	      URI.parse(url_for(options)).read("User-Agent" => "Ruby/#{RUBY_VERSION}")
	    end

	    protected

	      # Query parameters shared by title and pageid lookups.
	      def page_query
	        {
	          :action => "query",
	          :prop   => %w{ revisions links images categories },
	          :rvprop => "content"
	        }
	      end

	      # Requests the redirect target while the page is a redirect, at most
	      # MAX_REDIRECTS times; returns the last page fetched.
	      def resolve_redirects(page, options)
	        hops = 0
	        while follow_redirects && page.redirect? && hops < MAX_REDIRECTS
	          page = Page.new request_page(page.redirect_title, options)
	          hops += 1
	        end
	        page
	      end

	      # Returns the page title for +title+. URL-looking input is reduced to
	      # its last path segment via Url; anything else is returned untouched.
	      def title_from(title)
	        return title unless title.is_a?(String) && title =~ URL_LIKE
	        Url.new(title).title || title
	      rescue StandardError
	        # Best effort: input that cannot be parsed as a URL is used as given.
	        title
	      end

	      # Returns the page id for +pageid+, accepting a URL whose last path
	      # segment is the id. Unparseable input is used as given.
	      def pageid_from(pageid)
	        Url.new(pageid).pageid || pageid
	      rescue StandardError
	        pageid
	      end

	      def configuration_options
	        {
	          :domain => Configuration[:domain],
	          :path   => Configuration[:path]
	        }
	      end

	      # Builds the request URL from BASE_URL, the configured domain/path and
	      # +options+.
	      def url_for(options)
	        url = BASE_URL.dup
	        configuration_options.merge(options).each do |key, val|
	          value = urlify_value(val).to_s
	          placeholder = ":#{key}"
	          if url.include?(placeholder)
	            # Block form: the value is inserted literally, without
	            # backreference interpretation of backslashes.
	            url.sub!(placeholder) { value }
	          else
	            url << "&#{key}=#{value}"
	          end
	        end
	        url
	      end

	      # Arrays become a single '|'-separated, encoded value.
	      def urlify_value(val)
	        case val
	        when Array
	          encode(val.flatten.join('|'))
	        else
	          encode(val)
	        end
	      end

	      # Percent-encodes String values; other values pass through unchanged.
	      def encode(val)
	        return val unless val.is_a?(String)
	        # URI.encode was removed in Ruby 3.0; the default parser's escape is
	        # the same operation. '&' and '+' are left alone by it, but inside a
	        # query value they must be escaped ('+' would be decoded as a space).
	        URI::DEFAULT_PARSER.escape(val).gsub('&', '%26').gsub('+', '%2B')
	      end
	  end
	end

	require 'singleton'

	module Wikipedia
	  # Process-wide settings holder (a Singleton).
	  #
	  # Each directive is a combined reader/writer: called with no arguments it
	  # returns the stored value, called with an argument it stores that argument.
	  class Configuration
	    include Singleton

	    # Defines one reader/writer instance method per name in +directives+,
	    # backed by the instance variable of the same name.
	    def self.directives(*directives)
	      directives.each do |name|
	        ivar = "@#{name}"
	        define_method(name) do |*args|
	          # `next` makes the block's value the method's return value.
	          next instance_variable_get(ivar) if args.empty?
	          instance_variable_set(ivar, args.first)
	        end
	      end
	    end

	    # Reads a directive from the singleton, e.g. Configuration[:domain].
	    def self.[](directive)
	      instance.__send__(directive)
	    end

	    directives :domain, :path
	  end
	end

	require 'hpricot'

	module Wikipedia
	  # Wraps the JSON body returned by an "action=query" API request and exposes
	  # the first page of the result (wikitext, links, images, image info, ...).
	  class Page
	    # Labels that mark an external link as the subject's own web site.
	    WEBSITE_LABEL = /(web site|website|official (.*)site)/i

	    # json - the raw JSON response body (String); nil/blank yields no data.
	    def initialize(json)
	      require 'json'
	      @json = json
	      # JSON.parse rather than JSON.load: the body comes from the network and
	      # JSON.load is documented as intended for trusted input only.
	      @data = (json.nil? || json.to_s.strip.empty?) ? nil : JSON.parse(json)
	    end

	    # The first entry of query.pages, or nil when the response carries no
	    # pages (for example an API error response).
	    def page
	      return nil unless @data.is_a?(Hash)
	      query = @data['query']
	      return nil unless query.is_a?(Hash)
	      pages = query['pages']
	      pages.respond_to?(:values) ? pages.values.first : nil
	    end

	    # Wikitext of the first revision, or nil when there is none.
	    def content
	      revisions = page_value('revisions')
	      revision = revisions && revisions.first
	      return nil unless revision
	      # The text is stored under '*'. A revision hash may list other keys
	      # (contentformat, contentmodel) first, so taking the first value is
	      # only a fallback for hashes without a '*' key.
	      revision.key?('*') ? revision['*'] : revision.values.first
	    end

	    def sanitized_content
	      self.class.sanitize(content)
	    end

	    # Returns an array of [url, label, trailing_text] triples for external
	    # links that look like the subject's web site, followed by every bare
	    # http(s) URL found in the text; nil when the page has no content.
	    # Links containing "</ref>" or pointing at web.archive.org are dropped.
	    def external_website_uri
	      text = content
	      return nil if text.nil?

	      # "[url label] trailing text" and plain "[url label]" forms.
	      labelled = text.scan(/\[(https?:\/\/\S+)\s+([^\]]+)\]\s(.+)/).map { |x| [x.first.chomp(']').chomp('}}'), x[1], x[2]] }
	      labelled += text.scan(/\[(https?:\/\/\S+)\s+([^\]]+)\]/).map { |x| [x.first.chomp(']').chomp('}}'), x[1], nil] }

	      site_links = labelled.select { |x| (x.last && x.last[WEBSITE_LABEL]) || x[1][WEBSITE_LABEL] }

	      # URLs not directly preceded by '[' (e.g. template parameters).
	      site_links += text.scan(/[^\[](https?:\/\/\S+)/).map { |x| [x.first.split('|').first.chomp(']').chomp('}}'), x.first, nil] }

	      site_links.delete_if { |x| x.first[/<\/ref>/] || x.first[/web.archive.org/] }
	      site_links
	    end

	    # File name of the page's logo/lead image taken from the wikitext, or nil.
	    # A candidate whose name contains "logo" wins over the others.
	    def logo_image
	      text = content
	      return nil if text.nil?

	      candidates = [
	        text[/(logo|image|image_name)\s*=\s*\[\[(Image|File):(.+\.(png|svg|gif|jpg|jpeg))/, 3],
	        text[/(logo|image|image_name)\s*=\s*(.+\.(png|svg|gif|jpg|jpeg))/, 2],
	        text[/\[\[(Image|File):(.+\.(png|svg|gif|jpg|jpeg))/, 2]
	      ].compact
	      candidates.detect { |x| x[/logo/i] } || candidates.first
	    end

	    # Sanitized content parsed with Hpricot.
	    def doc
	      Hpricot(sanitized_content)
	    end

	    # Returns the MatchData of the "#REDIRECT [[...]]" marker (truthy) when
	    # the page is a redirect, nil/false otherwise.
	    def redirect?
	      content && content.match(/\#REDIRECT\s+\[\[(.*?)\]\]/i)
	    end

	    # Title of the redirect target, or nil.
	    def redirect_title
	      matches = redirect?
	      matches && matches[1]
	    end

	    # NOTE(review): this returns the redirect target exactly as
	    # redirect_title does, i.e. the text inside [[...]], not a numeric id.
	    def redirect_pageid
	      redirect_title
	    end

	    def title
	      page_value('title')
	    end

	    # Category titles from the API response; [] when absent.
	    def categories
	      titles_of('categories') || []
	    end

	    # Category names parsed from the wikitext (sort keys after '|' removed),
	    # falling back to #categories when the text names none.
	    def alt_categories
	      text = content
	      return categories unless text

	      cats = text.scan(/\[\[(Category:[^\]]+)\]\]/).map { |x| x.first.split('|').first }
	      cats.empty? ? categories : cats
	    end

	    # Titles of linked pages, or nil when the response has no links.
	    def links
	      titles_of('links')
	    end

	    # Titles of images used on the page, or nil when the response has none.
	    def images
	      titles_of('images')
	    end

	    def thumbnail_url
	      image_info('thumburl')
	    end

	    def thumbnail_height
	      image_info('thumbheight')
	    end

	    def thumbnail_width
	      image_info('thumbwidth')
	    end

	    def image_url
	      image_info('url')
	    end

	    def description_url
	      image_info('descriptionurl')
	    end

	    # URLs of the page's jpg/jpeg/png/gif images (one find_image request per
	    # image); nil when the response lists no images.
	    def image_urls
	      list = images
	      return nil unless list

	      wanted = list.select { |i| i =~ /^file:.+\.(jpg|jpeg|png|gif)$/i && !i.include?("LinkFA-star") }
	      wanted.map { |image_title| Wikipedia.find_image(image_title).image_url }
	    end

	    def raw_data
	      @data
	    end

	    def json
	      @json
	    end

	    # Converts wikitext to rough HTML: templates, tables, refs, comments and
	    # images are removed, internal links are reduced to their text, bold and
	    # italic quotes become <b>/<i>, and blank-line separated sections become
	    # <p> paragraphs. Returns nil for nil input.
	    def self.sanitize( s )
	      return nil unless s
	      s = s.dup

	      # strip anything inside curly braces, innermost first
	      template = /\{\{[^\{\}]+?\}\}/
	      s.gsub!(template, '') while s =~ template

	      # strip info box
	      s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

	      # strip internal links; Image:/File: links are skipped here so they are
	      # removed entirely below instead of being turned into plain text
	      s.gsub!(/\[\[(?!(?:Image|File):)([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
	      s.gsub!(/\[\[(?!(?:Image|File):)([^\]\|]+?)\]\]/, '\1')

	      # strip images and file links
	      s.gsub!(/\[\[Image:[^\[\]]+?\]\]/, '')
	      s.gsub!(/\[\[File:[^\[\]]+?\]\]/, '')

	      # convert bold/italic to html
	      s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
	      s.gsub!(/'''(.+?)'''/, '<b>\1</b>')
	      s.gsub!(/''(.+?)''/, '<i>\1</i>')

	      # misc
	      s.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
	      s.gsub!(/<!--[^>]+?-->/, '')
	      s.gsub!('  ', ' ')
	      s.strip!

	      # create paragraphs
	      sections = s.split("\n\n")
	      if sections.size > 1
	        s = sections.map { |section| "<p>#{section.strip}</p>" }.join("\n")
	      end

	      s
	    end

	    private

	    # Value stored under +key+ in the page hash, or nil when there is no page.
	    def page_value(key)
	      current = page
	      current && current[key]
	    end

	    # 'title' of every entry in the list stored under +key+, or nil.
	    def titles_of(key)
	      entries = page_value(key)
	      entries && entries.map { |entry| entry['title'] }
	    end

	    # Field of the first imageinfo entry, or nil when there is none.
	    def image_info(field)
	      infos = page_value('imageinfo')
	      info = infos && infos.first
	      info && info[field]
	    end
	  end
	end

	module Wikipedia
	  # Extracts the page identifier (title or page id) from a page URL: the
	  # percent-decoded last segment of the URL's path.
	  class Url
	    def initialize(wiki_url)
	      @wiki_url = wiki_url
	    end

	    # Last path segment of the URL, decoded. Raises (URI::InvalidURIError or
	    # ArgumentError) when the input is not a URL with a usable path segment.
	    def title
	      @title ||= last_path_segment
	    end

	    # Same extraction as #title; kept as a separate reader for pageid URLs.
	    def pageid
	      @pageid ||= last_path_segment
	    end

	    private

	    def last_path_segment
	      path = URI.parse(@wiki_url).path
	      segment = path && path.split('/').last
	      raise ArgumentError, "no page segment in #{@wiki_url.inspect}" if segment.nil?
	      # URI.decode was removed in Ruby 3.0; the default parser's unescape
	      # performs the same percent-decoding.
	      URI::DEFAULT_PARSER.unescape(segment)
	    end
	  end
	end

	require 'uri'

	module Wikipedia

	  class << self

	    # Examples :
	    # page = Wikipedia.find('Rails')
	    # => #<Wikipedia:0x123102>
	    # page.content
	    # => wiki content appears here
	    def find(page, options = {})
	      client.find page, options
	    end

	    # Looks a page up by page id; see Client#find_pageid.
	    def find_pageid(pageid, options = {})
	      client.find_pageid pageid, options
	    end

	    # Fetches image info for a "File:..." title; see Client#find_image.
	    def find_image(title, options = {})
	      client.find_image title, options
	    end

	    # Looks up the image named by page.logo_image and returns
	    # [url, height, width, description_url].
	    #
	    # When a thumbnail is available its url and dimensions are returned;
	    # otherwise the full image url with nil for the other three entries.
	    # All four entries are nil when the page has no logo image or no url
	    # could be found.
	    #
	    # height - requested thumbnail height (sent as iiurlheight)
	    # width  - requested thumbnail width (sent as iiurlwidth), default '210'
	    def find_page_image(page, height, width = '210')
	      nothing = [nil, nil, nil, nil]
	      logo = page && page.logo_image
	      return nothing unless logo

	      image = find_image("File:#{logo.gsub(' ','_')}", :iiurlheight => height, :iiurlwidth => width)
	      if image.thumbnail_url
	        [image.thumbnail_url, image.thumbnail_height, image.thumbnail_width, image.description_url]
	      elsif image.image_url
	        [image.image_url, nil, nil, nil]
	      else
	        nothing
	      end
	    end

	    # Evaluates the block against the Configuration singleton, so directives
	    # can be written as bare calls (domain '...', path '...').
	    def Configure(&block)
	      Configuration.instance.instance_eval(&block)
	    end

	    # The shared Client, created on first use. This stays public: a bare
	    # `private` has no effect on `def self.` methods, so it always was
	    # callable as Wikipedia.client.
	    def client
	      @client ||= Wikipedia::Client.new
	    end
	  end

	  Configure {
	    domain 'en.wikipedia.org'
	    path   'w/api.php'
	  }
	end