Skip to content

Instantly share code, notes, and snippets.

@muffinista
Created June 14, 2013 12:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save muffinista/5781615 to your computer and use it in GitHub Desktop.
Save muffinista/5781615 to your computer and use it in GitHub Desktop.
Parsing wikimedia dumps
# SAX handler for a MediaWiki XML dump.
#
# For every <page> element it:
# * prints the page title to stdout;
# * appends "title<TAB>target" to the `redirects` file when the page is a
#   redirect whose target differs from its own title (case-insensitively);
# * otherwise, when the page has text and is not in a skipped namespace,
#   appends the title to the `titles` file and writes the page text to a file
#   named after the MD5 of the title (see #path_for_title).
class Document < Nokogiri::XML::SAX::Document
  # Title prefixes of pages that are not stored as articles.
  SKIPPED_PREFIXES = %w[
    Help: MediaWiki: Book: Portal: Template: Wikipedia: File: Category:
  ].freeze

  attr_accessor :in_page, :in_title, :in_text, :text, :title, :depth, :root

  # @param root [String] directory that receives the `redirects` and `titles`
  #   index files and the per-page content tree
  # @param depth [Integer] number of trailing MD5 hex digits used as nested
  #   directory levels for page content
  def initialize(root: "/opt/wiki", depth: 4)
    @depth = depth
    @root = root
    FileUtils.mkdir_p(@root)
    # Mode "w" truncates an existing file, so no explicit delete is needed.
    @redirects = File.new(File.join(@root, "redirects"), "w")
    @titles = File.new(File.join(@root, "titles"), "w")
  end

  # Resets the element-tracking flags at the start of the document.
  def start_document
    @in_page = false
    @in_title = false
    @in_text = false
  end

  # Announces the end of the document and closes both index files.
  def end_document
    puts "the document has ended"
    @redirects.close
    @titles.close
  end

  # Tracks which element is being read. A <page> resets the per-page buffers;
  # <redirect>, <title> and <text> are only honoured inside a page.
  #
  # @param name [String] element name
  # @param attributes [Array<Array(String, String)>] name/value pairs
  def start_element(name, attributes = [])
    if name == "page"
      @in_page = true
      @in_title = false
      @in_text = false
      # Unary plus guarantees mutable buffers for #characters to append to.
      @title = +""
      @text = +""
      @redirect = ""
    elsif @in_page
      case name
      when "redirect"
        # e.g. <redirect title="Computer accessibility" />
        # to_s keeps @redirect a String when the title attribute is absent.
        @redirect = attributes.to_h["title"].to_s
      when "title"
        @in_title = true
        @in_text = false
      when "text"
        @in_title = false
        @in_text = true
      end
    end
  end

  # Clears the matching flag; closing a <page> emits the collected page.
  #
  # @param name [String] element name
  def end_element(name)
    case name
    when "title"
      @in_title = false
    when "text"
      @in_text = false
    when "page"
      @in_page = false
      finish_page
    end
  end

  # Appends character data to the text or title buffer, whichever is active.
  #
  # @param str [String] chunk of character data
  def characters(str)
    if @in_text
      @text << str
    elsif @in_title
      @title << str
    end
  end

  # Writes +page+ to the content file derived from +title+.
  #
  # @param title [String] page title
  # @param page [String] page text
  def write_page_content(title, page)
    dest = path_for_title(title)
    File.open(dest, 'w') { |f| f.write(page) }
  end

  # Builds (and creates the directory for) the content path of +key+: the
  # last @depth hex digits of the key's MD5 each become one directory level
  # under @root, and the file itself is named with the full digest.
  #
  # @param key [#to_s] page title
  # @return [String] path of the content file
  def path_for_title(key)
    md5 = Digest::MD5.hexdigest(key.to_s)
    dir = File.join(@root, *md5[-@depth, @depth].chars)
    FileUtils.mkdir_p(dir)
    File.join(dir, md5)
  end

  private

  # Prints the title, then records the page as a redirect or as an article.
  def finish_page
    puts @title
    if @redirect.empty?
      store_article unless @text.empty? || skipped_title?(@title)
    elsif @title.downcase != @redirect.downcase
      @redirects.puts "#{@title}\t#{@redirect}"
    end
  end

  # Adds the current page to the titles index and writes its text to disk.
  def store_article
    @titles.puts @title
    write_page_content(@title, @text)
  end

  # True when +title+ starts with one of the skipped namespace prefixes.
  # The match is anchored to the start so that an article whose title merely
  # contains e.g. "Help:" further along is still stored.
  def skipped_title?(title)
    title.start_with?(*SKIPPED_PREFIXES)
  end
end
#!/usr/bin/env ruby
require 'bundler/setup'
require 'digest/md5'
require 'fileutils'
require 'nokogiri'
require './document'
# Stream the XML arriving on ARGF into a SAX push parser, 8000 bytes at a
# time, so the whole input never has to be held in memory.
handler = ::Document.new
parser = Nokogiri::XML::SAX::PushParser.new(handler)
loop do
  chunk = ARGF.read(8000)
  # Stop on end of input (nil) or an empty read.
  break if chunk.nil? || chunk.empty?

  parser << chunk
end
parser.finish
#
# Usage:
#   bzcat ~/Downloads/enwiki-20130102-pages-articles.xml.bz2 | bundle exec ./parser.rb > titles.txt
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment