// Write the <link> tag that loads the gist embed stylesheet into the page.
document.write(
  '<link rel="stylesheet" href="http://gist.github.com/stylesheets/gist/embed.css"/>'
);





// Write the embed markup for gist 102931 (gistfile1.rb) into the page: the
// highlighted listing followed by the "view raw" / "This Gist" footer links.
//
// The markup is one logical string. It is assembled from adjacent single-line
// fragments joined with `+` because a quoted JavaScript string literal may not
// contain a raw line break. The trailing whitespace at the end of each
// fragment is significant (it separates a tag name from its attributes, or is
// indentation inside the <pre> listing) and must be preserved.
document.write(
  '<div id=\"gist-102931\" class=\"gist\">\n  \n  \n    \n            \n\n      <div class=\"gist-file\">\n        <div class=\"gist-data gist-syntax\">\n          \n          \n          \n            <div class=\"gist-highlight\"><pre><div class=\"line\" id=\"LC1\"><span class=\"c1\">#! ruby -Ku<\/span><\/div><div class=\"line\" id=\"LC2\">&nbsp;<\/div><div class=\"line\" id=\"LC3\"><span class=\"nb\">require<\/span> <span class=\"s2\">&quot;rubygems&quot;<\/span><\/div><div class=\"line\" id=\"LC4\"><span class=\"nb\">require<\/span> <span class=\"s2\">&quot;nokogiri&quot;<\/span><\/div><div class=\"line\" id=\"LC5\"><span class=\"nb\">require<\/span> <span class=\"s1\">&#39;rexml/document&#39;<\/span><\/div><div class=\"line\" id=\"LC6\">&nbsp;<\/div><div class=\"line\" id=\"LC7\"><span class=\"k\">class<\/span> <span class=\"nc\">Doc<\/span> <span class=\"o\">&lt;<\/span> <span class=\"no\">Nokogiri<\/span><span class=\"o\">::<\/span><span class=\"no\">XML<\/span><span class=\"o\">::<\/span><span class=\"no\">SAX<\/span><span class=\"o\">::<\/span><span class=\"no\">Document<\/span><\/div><div class=\"line\" id=\"LC8\">&nbsp;&nbsp;<span class=\"k\">def<\/span> <span class=\"nf\">initialize<\/span><\/div><div class=\"line\" id=\"LC9\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@document<\/span> <span class=\"o\">=<\/span> <span class=\"kp\">nil<\/span><\/div><div class=\"line\" id=\"LC10\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span> <span class=\"o\">=<\/span> <span class=\"kp\">nil<\/span><\/div><div class=\"line\" id=\"LC11\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@count<\/span> <span class=\"o\">=<\/span> <span class=\"mi\">0<\/span><\/div><div class=\"line\" id=\"LC12\">&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC13\">&nbsp;<\/div><div class=\"line\" id=\"LC14\">&nbsp;&nbsp;<span class=\"k\">def<\/span> <span class=\"nf\">start_element<\/span><span class=\"p\">(<\/span><span ' +
  'class=\"nb\">name<\/span><span class=\"p\">,<\/span> <span class=\"n\">attrs<\/span><span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC15\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">if<\/span> <span class=\"nb\">name<\/span> <span class=\"o\">==<\/span> <span class=\"s2\">&quot;page&quot;<\/span><\/div><div class=\"line\" id=\"LC16\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@document<\/span> <span class=\"o\">=<\/span> <span class=\"kp\">nil<\/span><\/div><div class=\"line\" id=\"LC17\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span> <span class=\"o\">=<\/span> <span class=\"no\">StringIO<\/span><span class=\"o\">.<\/span><span class=\"n\">new<\/span><\/div><div class=\"line\" id=\"LC18\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC19\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">return<\/span> <span class=\"kp\">nil<\/span> <span class=\"k\">if<\/span> <span class=\"n\">skip?<\/span><\/div><div class=\"line\" id=\"LC20\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span><span class=\"o\">.<\/span><span class=\"n\">write<\/span><span class=\"p\">(<\/span><span class=\"sx\">%|&lt;<\/span><span class=\"si\">#{<\/span><span class=\"nb\">name<\/span><span class=\"si\">}<\/span><span class=\"sx\">|)<\/span><\/div><div class=\"line\" id=\"LC21\"><span class=\"sx\">    Hash[ *attrs ].each do |pair|<\/span><\/div><div class=\"line\" id=\"LC22\"><span class=\"sx\">      @buf.write( &quot; <\/span><span class=\"si\">#{<\/span><span class=\"n\">pair<\/span><span class=\"o\">[<\/span><span class=\"mi\">0<\/span><span class=\"o\">]<\/span><span class=\"si\">}<\/span><span class=\"sx\">=\\&quot;<\/span><span class=\"si\">#{<\/span><span class=\"n\">pair<\/span><span class=\"o\">[<\/span><span class=\"mi\">1<\/span><span class=\"o\">]<\/span><span class=\"si\">}<\/span><span class=\"sx\">\\&quot;&quot;)<\/span><\/div><div class=\"line\" id=\"LC23\"><span class=\"sx\">    ' +
  'end<\/span><\/div><div class=\"line\" id=\"LC24\"><span class=\"sx\">    @buf.write(&quot;&gt;&quot;)<\/span><\/div><div class=\"line\" id=\"LC25\"><span class=\"sx\">  end<\/span><\/div><div class=\"line\" id=\"LC26\">&nbsp;<\/div><div class=\"line\" id=\"LC27\"><span class=\"sx\">  def characters(text)<\/span><\/div><div class=\"line\" id=\"LC28\"><span class=\"sx\">    return nil if skip?<\/span><\/div><div class=\"line\" id=\"LC29\"><span class=\"sx\">    @buf.write(REXML::Text::normalize(text))<\/span><\/div><div class=\"line\" id=\"LC30\"><span class=\"sx\">  end<\/span><\/div><div class=\"line\" id=\"LC31\">&nbsp;<\/div><div class=\"line\" id=\"LC32\"><span class=\"sx\">  def end_element(name)<\/span><\/div><div class=\"line\" id=\"LC33\"><span class=\"sx\">    return nil if skip?<\/span><\/div><div class=\"line\" id=\"LC34\"><span class=\"sx\">    @buf.write(%|&lt;/<\/span><span class=\"si\">#{<\/span><span class=\"nb\">name<\/span><span class=\"si\">}<\/span><span class=\"sx\">&gt;|)<\/span><\/div><div class=\"line\" id=\"LC35\"><span class=\"sx\">    if name == &quot;page&quot;<\/span><\/div><div class=\"line\" id=\"LC36\"><span class=\"sx\">      begin<\/span><\/div><div class=\"line\" id=\"LC37\"><span class=\"sx\">        proc_document(Nokogiri.XML(@buf.string))<\/span><\/div><div class=\"line\" id=\"LC38\"><span class=\"sx\">      rescue =&gt; e<\/span><\/div><div class=\"line\" id=\"LC39\"><span class=\"sx\">        puts e<\/span><\/div><div class=\"line\" id=\"LC40\"><span class=\"sx\">        exit<\/span><\/div><div class=\"line\" id=\"LC41\"><span class=\"sx\">      end<\/span><\/div><div class=\"line\" id=\"LC42\"><span class=\"sx\">      @buf.close<\/span><\/div><div class=\"line\" id=\"LC43\"><span class=\"sx\">    end<\/span><\/div><div class=\"line\" id=\"LC44\"><span class=\"sx\">  end<\/span><\/div><div class=\"line\" id=\"LC45\">&nbsp;<\/div><div class=\"line\" id=\"LC46\"><span class=\"sx\">  def proc_document(doc)<\/span><\/div><div ' +
  'class=\"line\" id=\"LC47\"><span class=\"sx\">    title = doc.at(:title).inner_text<\/span><\/div><div class=\"line\" id=\"LC48\"><span class=\"sx\">    text = doc.at(:text).inner_text<\/span><\/div><div class=\"line\" id=\"LC49\">&nbsp;<\/div><div class=\"line\" id=\"LC50\"><span class=\"sx\">    @count += 1<\/span><\/div><div class=\"line\" id=\"LC51\"><span class=\"sx\">    puts @count<\/span><\/div><div class=\"line\" id=\"LC52\"><span class=\"sx\">  end<\/span><\/div><div class=\"line\" id=\"LC53\">&nbsp;<\/div><div class=\"line\" id=\"LC54\"><span class=\"sx\">  def skip?<\/span><\/div><div class=\"line\" id=\"LC55\"><span class=\"sx\">    return true unless @buf<\/span><\/div><div class=\"line\" id=\"LC56\"><span class=\"sx\">    return true if @buf.closed?<\/span><\/div><div class=\"line\" id=\"LC57\"><span class=\"sx\">  end<\/span><\/div><div class=\"line\" id=\"LC58\"><span class=\"sx\">end<\/span><\/div><div class=\"line\" id=\"LC59\">&nbsp;<\/div><div class=\"line\" id=\"LC60\"><span class=\"sx\">File.open(&#39;./jawiki-latest-pages-articles.xml&#39;) { |f|<\/span><\/div><div class=\"line\" id=\"LC61\">&nbsp;&nbsp;<span class=\"no\">Nokogiri<\/span><span class=\"o\">::<\/span><span class=\"no\">XML<\/span><span class=\"o\">::<\/span><span class=\"no\">SAX<\/span><span class=\"o\">::<\/span><span class=\"no\">Parser<\/span><span class=\"o\">.<\/span><span class=\"n\">new<\/span><span class=\"p\">(<\/span> <span class=\"no\">Doc<\/span><span class=\"o\">.<\/span><span class=\"n\">new<\/span> <span class=\"p\">)<\/span><span class=\"o\">.<\/span><span class=\"n\">parse<\/span><span class=\"p\">(<\/span> <span class=\"n\">f<\/span> <span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC62\"><span class=\"p\">}<\/span><\/div><div class=\"line\" id=\"LC63\">&nbsp;<\/div><\/pre><\/div>\n          \n        <\/div>\n\n        <div class=\"gist-meta\">\n          <a ' +
  'href=\"http://gist.github.com/raw/102931/a758ffc4a156e91953908b4bc1f7d52c35a729ea/gistfile1.rb\" style=\"float:right;\">view raw<\/a>\n          <a href=\"http://gist.github.com/102931#file_gistfile1.rb\" style=\"float:right;margin-right:10px;color:#666\">gistfile1.rb<\/a>\n          <a href=\"http://gist.github.com/102931\">This Gist<\/a> brought to you by <a href=\"http://github.com\">GitHub<\/a>.\n        <\/div>\n      <\/div>\n    \n  \n<\/div>\n'
);
