Skip to content

Instantly share code, notes, and snippets.

@levinalex
Created November 6, 2009 19:37
Show Gist options
  • Save levinalex/228213 to your computer and use it in GitHub Desktop.
Save levinalex/228213 to your computer and use it in GitHub Desktop.
wikipedia2git.rb
require 'httparty'
require 'grit'
require 'pathname'
module Wikipedia
# One revision of an article: a thin read-only wrapper around the revision
# data handed to the constructor.
class Revision
  # article  - the object this revision belongs to (stored only).
  # revision - revision data, looked up with String keys via #[].
  def initialize(article, revision)
    @article, @revision = article, revision
  end

  # Value stored under the "timestamp" key.
  def timestamp
    @revision["timestamp"]
  end

  # Value stored under the "user" key.
  def user
    @revision["user"]
  end

  # Value stored under the "revid" key.
  def revid
    @revision["revid"]
  end

  # Value stored under the "anon" key.
  def anon
    @revision["anon"]
  end

  # Value stored under the "comment" key.
  def comment
    @revision["comment"]
  end

  # The revision body, which lives under the "*" key.
  def content
    @revision["*"]
  end
end
# Enumerable view over the revisions of one article. Revisions are fetched
# from the MediaWiki API in batches (rvlimit 50, rvdir newer) and yielded
# one at a time as Revision objects.
class Revisions
  include HTTParty
  include Enumerable

  base_uri "http://en.wikipedia.org/w/api.php"
  default_params :action => :query,
                 :prop => :revisions,
                 :rvdir => :newer,
                 :rvprop => [:content, :timestamp, :user, :revid, :comment].join("|"),
                 :rvlimit => 50,
                 :format => :json

  # article - object responding to #pageid; it is also passed on to every
  #           Revision that gets yielded.
  def initialize(article)
    @article = article
    @pageid = article.pageid
    @revs = []
  end

  # Yields a Revision for every revision of the page, requesting further
  # batches until the API stops returning a continuation element.
  #
  # Returns nil when called with a block, or an Enumerator when called
  # without one.
  def each
    return to_enum(:each) unless block_given?

    continuation = {}
    loop do
      result = self.class.get('', :query => {:pageids => @pageid}.merge(continuation))
      page = result["query"]["pages"][@pageid]
      # A page entry without a "revisions" key contributes no revisions
      # instead of raising NoMethodError on nil.
      @revs = (page["revisions"] || []).each do |rev|
        yield Revision.new(@article, rev)
      end

      continuation = next_query(result)
      return unless continuation

      # Pause between successive requests. The original `sleep 1` sat after
      # the loop and could never run.
      sleep 1
    end
  end

  private

  # Extracts the parameters for the follow-up request from a response, or
  # returns nil when there is nothing more to fetch. Understands both the
  # top-level "continue" element and the legacy "query-continue" element.
  def next_query(result)
    if result["continue"]
      result["continue"]
    elsif result["query-continue"]
      result["query-continue"]["revisions"]
    end
  end
end
# A Wikipedia article addressed by its title.
class Article
  include HTTParty
  base_uri "http://en.wikipedia.org/w/api.php"

  # name - the article title sent to the API as the `titles` parameter.
  def initialize(name)
    @name = name
  end

  # The page id for this article, taken from the first key of the "pages"
  # element of the API response. Looked up on first use, then cached.
  def pageid
    @pageid ||= lookup_pageid
  end

  # A Revisions collection for this article.
  def revisions
    Revisions.new(self)
  end

  private

  # Performs the API query for this title and returns the first key of the
  # "pages" element in the response.
  def lookup_pageid
    params = {
      :action => :query,
      :prop => :revisions,
      :titles => @name,
      :format => :json
    }
    response = self.class.get('', :query => params)
    response["query"]["pages"].keys.first
  end
end
end
# Script entry point: replays the revision history of one article
# (ARGV[0], defaulting to "Main Page") into a bare git repository at
# cache/<pageid>/<pageid>.git, committing each revision's content as the
# file "article".
if __FILE__ == $0
  # FileUtils is used below; load it explicitly rather than relying on
  # another library having required it.
  require 'fileutils'

  article = Wikipedia::Article.new(ARGV[0] || "Main Page")
  pageid = article.pageid

  dir = Pathname.new("cache").join(pageid)
  FileUtils.mkdir_p(dir)
  repo = Grit::Repo.init_bare(dir.join("#{pageid}.git"))

  last_commit = nil
  article.revisions.each do |r|
    puts [r.timestamp, r.user].join(" - ")
    user = Grit::Actor.new(r.user, r.user)
    # NOTE(review): presumably meant to stamp the commit with the revision's
    # timestamp -- confirm that Grit's index commit honours this variable.
    ENV["GIT_AUTHOR_DATE"] = r.timestamp.to_s
    index = repo.index
    index.add("article", r.content)
    # Pass the parent as a list: Grit documents `parents` as one or more
    # commits, and a bare String is not enumerable on Ruby 1.9+. The first
    # commit gets an empty list (no parent).
    last_commit = index.commit(r.comment, Array(last_commit), user)
  end
  p last_commit
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment