Created
June 14, 2013 12:59
-
-
Save muffinista/5781615 to your computer and use it in GitHub Desktop.
Parsing Wikimedia dumps
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'digest/md5'
require 'fileutils'

# SAX handler that walks a MediaWiki XML dump page by page.
#
# For every <page> element it collects the <title> text, the <text> body and
# the target of an optional <redirect title="..." /> element, then:
#
# * redirect pages are recorded in "<root>/redirects" as "title<TAB>target"
#   (skipped when title and target differ only by case);
# * non-redirect pages with a non-empty body whose title is not in one of the
#   SKIPPED_NAMESPACES are recorded in "<root>/titles" and their body is
#   written to a file located by #path_for_title.
#
# Every page title is also printed to standard output.
#
# Nokogiri must already be loaded before this file is required.
class Document < Nokogiri::XML::SAX::Document
  # Directory used when no root is given.
  DEFAULT_ROOT = "/opt/wiki"

  # Title prefixes of pages whose content is not stored.
  SKIPPED_NAMESPACES = %w[
    Help: MediaWiki: Book: Portal: Template: Wikipedia: File: Category:
  ].freeze

  attr_accessor :in_page, :in_title, :in_text, :text, :title, :depth, :root

  # @param root [String] directory that receives the index files and the
  #   per-page content tree; created if it does not exist
  # @param depth [Integer] number of trailing MD5 hex digits used as nested
  #   directory levels by #path_for_title
  def initialize(root: DEFAULT_ROOT, depth: 4)
    super()
    @depth = depth
    @root = root
    FileUtils.mkdir_p(@root)
    @redirects = open_index("redirects")
    @titles = open_index("titles")
    reset_page
  end

  # Clears the element-tracking flags at the start of the XML stream.
  def start_document
    @in_page = false
    @in_title = false
    @in_text = false
  end

  # Announces the end of the stream and closes both index files.
  def end_document
    puts "the document has ended"
    @redirects.close
    @titles.close
  end

  # Tracks which element is being read. <redirect>, <title> and <text> are
  # only honoured inside a <page>.
  #
  # A redirect looks like: <redirect title="Computer accessibility" />
  #
  # @param name [String] element name
  # @param attributes [Array<Array(String, String)>] name/value pairs
  def start_element(name, attributes = [])
    if name == "page"
      @in_page = true
      reset_page
    elsif @in_page
      case name
      when "redirect"
        # to_s guards against a <redirect> element without a title attribute.
        @redirect = attributes.to_h["title"].to_s
      when "title"
        @in_title = true
        @in_text = false
      when "text"
        @in_title = false
        @in_text = true
      end
    end
  end

  # Leaves the current element; closing a <page> flushes what was collected.
  #
  # @param name [String] element name
  def end_element(name)
    case name
    when "title"
      @in_title = false
    when "text"
      @in_text = false
    when "page"
      @in_page = false
      finish_page
    end
  end

  # Appends character data to the body or title buffer, depending on which
  # element is currently open. Text outside those elements is ignored.
  #
  # @param str [String] chunk of character data
  def characters(str)
    if @in_text
      @text << str
    elsif @in_title
      @title << str
    end
  end

  # Writes a page body to the file chosen by #path_for_title, replacing any
  # existing file.
  #
  # @param title [String] page title, used to derive the destination path
  # @param page [String] page body to store
  def write_page_content(title, page)
    File.open(path_for_title(title), 'w') { |f| f.write(page) }
  end

  # Maps a key to "<root>/<d1>/.../<dN>/<md5>", where md5 is the hex MD5 of
  # the key and d1..dN are its last +depth+ hex digits, one directory level
  # each. The directory is created as a side effect.
  #
  # @param key [#to_s] value to hash, normally a page title
  # @return [String] path of the file for +key+
  def path_for_title(key)
    md5 = Digest::MD5.hexdigest(key.to_s)
    dir = File.join(@root, *md5[-@depth, @depth].chars)
    FileUtils.mkdir_p(dir)
    File.join(dir, md5)
  end

  private

  # Opens a fresh, empty index file under the root directory.
  def open_index(name)
    path = File.join(@root, name)
    FileUtils.rm_f(path)
    File.new(path, "w")
  end

  # Resets the per-page state. The buffers are dup'ed because #characters
  # appends to them in place.
  def reset_page
    @in_title = false
    @in_text = false
    @title = "".dup
    @text = "".dup
    @redirect = ""
  end

  # Emits the page that just ended: prints its title, then records it either
  # as a redirect or as a stored article.
  def finish_page
    puts @title
    if !@redirect.empty?
      return if @title.downcase == @redirect.downcase

      @redirects.puts "#{@title}\t#{@redirect}"
    elsif !@text.empty? && !skipped_namespace?(@title)
      @titles.puts @title
      write_page_content(@title, @text)
    end
  end

  # True when the title starts with one of the SKIPPED_NAMESPACES prefixes.
  # The match is anchored at the start so that ordinary articles which merely
  # contain such a prefix (for example "Self-Help: ...") are still stored.
  def skipped_namespace?(title)
    title.start_with?(*SKIPPED_NAMESPACES)
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Feeds a MediaWiki XML dump, read from ARGF (standard input or the files
# named on the command line), through a SAX push parser driven by Document.
require 'bundler/setup'
require 'nokogiri'
# Resolved relative to this script rather than the current working directory,
# so document.rb is expected to sit next to it.
require_relative 'document'

# Number of bytes handed to the push parser per read.
CHUNK_SIZE = 8000

parser = Nokogiri::XML::SAX::PushParser.new(::Document.new)

# Stream the input in fixed-size chunks so the dump is never held in memory
# as a whole; stop on the first nil or empty read.
while (chunk = ARGF.read(CHUNK_SIZE)) && !chunk.empty?
  parser << chunk
end
parser.finish

#
# bzcat ~/Downloads/enwiki-20130102-pages-articles.xml.bz2| bundle exec ./parser.rb > titles.txt
#
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment