// Gist embed, step 1: write the <link> tag that loads the shared gist embed
// stylesheet into the page at the point where this script is executing.
document.write(
  '<link rel="stylesheet" href="http://gist.github.com/stylesheets/gist/embed.css"/>'
);





// Gist embed, step 2: write the pre-rendered markup for gist 102931 (file
// "gistfile1.rb", a syntax-highlighted Ruby script) at the point where this
// script is executing.
//
// The markup is one logical string. It is assembled from several adjacent
// literals joined with `+` because a quoted JavaScript string literal may not
// contain a raw line break; every newline that belongs to the HTML is spelled
// as the `\n` escape inside the literals. `<\/` is the escaped form of `</`,
// which keeps a closing tag from terminating an inline <script> early.
document.write('<div id=\"gist-102931\" class=\"gist\">\n  \n  \n    \n            \n\n      <div class=\"gist-file\">\n        <div class=\"gist-data gist-syntax\">\n          \n          \n          \n            <div class=\"gist-highlight\"><pre><div class=\"line\" id=\"LC1\"><span class=\"c1\">#! ruby -Ku<\/span><\/div><div class=\"line\" id=\"LC2\">&nbsp;<\/div><div class=\"line\" id=\"LC3\"><span class=\"nb\">require<\/span> <span class=\"s2\">&quot;rubygems&quot;<\/span><\/div><div class=\"line\" id=\"LC4\"><span class=\"nb\">require<\/span> <span class=\"s2\">&quot;nokogiri&quot;<\/span><\/div><div class=\"line\" id=\"LC5\"><span class=\"nb\">require<\/span> <span class=\"s1\">&#39;rexml/document&#39;<\/span><\/div><div class=\"line\" id=\"LC6\">&nbsp;<\/div><div class=\"line\" id=\"LC7\"><span class=\"k\">class<\/span> <span class=\"nc\">Doc<\/span> <span class=\"o\">&lt;<\/span> <span class=\"no\">Nokogiri<\/span><span class=\"o\">::<\/span><span class=\"no\">XML<\/span><span class=\"o\">::<\/span><span class=\"no\">SAX<\/span><span class=\"o\">::<\/span><span class=\"no\">Document<\/span><\/div><div class=\"line\" id=\"LC8\">&nbsp;&nbsp;<span class=\"k\">def<\/span> <span class=\"nf\">initialize<\/span><\/div><div class=\"line\" id=\"LC9\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@document<\/span> <span class=\"o\">=<\/span> <span class=\"kp\">nil<\/span><\/div><div class=\"line\" id=\"LC10\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span> <span class=\"o\">=<\/span> <span class=\"kp\">nil<\/span><\/div><div class=\"line\" id=\"LC11\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@count<\/span> <span class=\"o\">=<\/span> <span class=\"mi\">0<\/span><\/div><div class=\"line\" id=\"LC12\">&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC13\">&nbsp;<\/div><div class=\"line\" id=\"LC14\">&nbsp;&nbsp;<span class=\"k\">def<\/span> <span class=\"nf\">start_element<\/span><span class=\"p\">(<\/span><span ' +
  'class=\"nb\">name<\/span><span class=\"p\">,<\/span> <span class=\"n\">attrs<\/span><span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC15\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">if<\/span> <span class=\"nb\">name<\/span> <span class=\"o\">==<\/span> <span class=\"s2\">&quot;page&quot;<\/span><\/div><div class=\"line\" id=\"LC16\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@document<\/span> <span class=\"o\">=<\/span> <span class=\"kp\">nil<\/span><\/div><div class=\"line\" id=\"LC17\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span> <span class=\"o\">=<\/span> <span class=\"no\">StringIO<\/span><span class=\"o\">.<\/span><span class=\"n\">new<\/span><\/div><div class=\"line\" id=\"LC18\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC19\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">return<\/span> <span class=\"kp\">nil<\/span> <span class=\"k\">if<\/span> <span class=\"n\">skip?<\/span><\/div><div class=\"line\" id=\"LC20\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span><span class=\"o\">.<\/span><span class=\"n\">write<\/span><span class=\"p\">(<\/span><span class=\"sx\">%|&lt;<\/span><span class=\"si\">#{<\/span><span class=\"nb\">name<\/span><span class=\"si\">}<\/span><span class=\"sx\">|<\/span><span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC21\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"no\">Hash<\/span><span class=\"o\">[<\/span> <span class=\"o\">*<\/span><span class=\"n\">attrs<\/span> <span class=\"o\">].<\/span><span class=\"n\">each<\/span> <span class=\"k\">do<\/span> <span class=\"o\">|<\/span><span class=\"n\">pair<\/span><span class=\"o\">|<\/span><\/div><div class=\"line\" id=\"LC22\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span><span class=\"o\">.<\/span><span class=\"n\">write<\/span><span class=\"p\">(<\/span> <span class=\"s2\">&quot; <\/span><span class=\"si\">#{<\/span><span class=\"n\">pair<\/span><span ' +
  'class=\"o\">[<\/span><span class=\"mi\">0<\/span><span class=\"o\">]<\/span><span class=\"si\">}<\/span><span class=\"s2\">=<\/span><span class=\"se\">\\&quot;<\/span><span class=\"si\">#{<\/span><span class=\"n\">pair<\/span><span class=\"o\">[<\/span><span class=\"mi\">1<\/span><span class=\"o\">]<\/span><span class=\"si\">}<\/span><span class=\"se\">\\&quot;<\/span><span class=\"s2\">&quot;<\/span><span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC23\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC24\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span><span class=\"o\">.<\/span><span class=\"n\">write<\/span><span class=\"p\">(<\/span><span class=\"s2\">&quot;&gt;&quot;<\/span><span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC25\">&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC26\">&nbsp;<\/div><div class=\"line\" id=\"LC27\">&nbsp;&nbsp;<span class=\"k\">def<\/span> <span class=\"nf\">characters<\/span><span class=\"p\">(<\/span><span class=\"n\">text<\/span><span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC28\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">return<\/span> <span class=\"kp\">nil<\/span> <span class=\"k\">if<\/span> <span class=\"n\">skip?<\/span><\/div><div class=\"line\" id=\"LC29\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span><span class=\"o\">.<\/span><span class=\"n\">write<\/span><span class=\"p\">(<\/span><span class=\"no\">REXML<\/span><span class=\"o\">::<\/span><span class=\"no\">Text<\/span><span class=\"o\">::<\/span><span class=\"n\">normalize<\/span><span class=\"p\">(<\/span><span class=\"n\">text<\/span><span class=\"p\">))<\/span><\/div><div class=\"line\" id=\"LC30\">&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC31\">&nbsp;<\/div><div class=\"line\" id=\"LC32\">&nbsp;&nbsp;<span class=\"k\">def<\/span> <span class=\"nf\">end_element<\/span><span class=\"p\">(<\/span><span ' +
  'class=\"nb\">name<\/span><span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC33\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">return<\/span> <span class=\"kp\">nil<\/span> <span class=\"k\">if<\/span> <span class=\"n\">skip?<\/span><\/div><div class=\"line\" id=\"LC34\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span><span class=\"o\">.<\/span><span class=\"n\">write<\/span><span class=\"p\">(<\/span><span class=\"sx\">%|&lt;/<\/span><span class=\"si\">#{<\/span><span class=\"nb\">name<\/span><span class=\"si\">}<\/span><span class=\"sx\">&gt;|<\/span><span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC35\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">if<\/span> <span class=\"nb\">name<\/span> <span class=\"o\">==<\/span> <span class=\"s2\">&quot;page&quot;<\/span><\/div><div class=\"line\" id=\"LC36\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">begin<\/span><\/div><div class=\"line\" id=\"LC37\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"n\">proc_document<\/span><span class=\"p\">(<\/span><span class=\"no\">Nokogiri<\/span><span class=\"o\">.<\/span><span class=\"n\">XML<\/span><span class=\"p\">(<\/span><span class=\"vi\">@buf<\/span><span class=\"o\">.<\/span><span class=\"n\">string<\/span><span class=\"p\">))<\/span><\/div><div class=\"line\" id=\"LC38\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">rescue<\/span> <span class=\"o\">=&gt;<\/span> <span class=\"n\">e<\/span><\/div><div class=\"line\" id=\"LC39\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"nb\">puts<\/span> <span class=\"n\">e<\/span><\/div><div class=\"line\" id=\"LC40\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"nb\">exit<\/span><\/div><div class=\"line\" id=\"LC41\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC42\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@buf<\/span><span class=\"o\">.<\/span><span ' +
  'class=\"n\">close<\/span><\/div><div class=\"line\" id=\"LC43\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC44\">&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC45\">&nbsp;<\/div><div class=\"line\" id=\"LC46\">&nbsp;&nbsp;<span class=\"k\">def<\/span> <span class=\"nf\">proc_document<\/span><span class=\"p\">(<\/span><span class=\"n\">doc<\/span><span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC47\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"n\">title<\/span> <span class=\"o\">=<\/span> <span class=\"n\">doc<\/span><span class=\"o\">.<\/span><span class=\"n\">at<\/span><span class=\"p\">(<\/span><span class=\"ss\">:title<\/span><span class=\"p\">)<\/span><span class=\"o\">.<\/span><span class=\"n\">inner_text<\/span><\/div><div class=\"line\" id=\"LC48\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"n\">text<\/span> <span class=\"o\">=<\/span> <span class=\"n\">doc<\/span><span class=\"o\">.<\/span><span class=\"n\">at<\/span><span class=\"p\">(<\/span><span class=\"ss\">:text<\/span><span class=\"p\">)<\/span><span class=\"o\">.<\/span><span class=\"n\">inner_text<\/span><\/div><div class=\"line\" id=\"LC49\">&nbsp;<\/div><div class=\"line\" id=\"LC50\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"vi\">@count<\/span> <span class=\"o\">+=<\/span> <span class=\"mi\">1<\/span><\/div><div class=\"line\" id=\"LC51\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"nb\">puts<\/span> <span class=\"vi\">@count<\/span><\/div><div class=\"line\" id=\"LC52\">&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC53\">&nbsp;<\/div><div class=\"line\" id=\"LC54\">&nbsp;&nbsp;<span class=\"k\">def<\/span> <span class=\"nf\">skip?<\/span><\/div><div class=\"line\" id=\"LC55\">&nbsp;&nbsp;&nbsp;&nbsp;<span class=\"k\">return<\/span> <span class=\"kp\">true<\/span> <span class=\"k\">unless<\/span> <span class=\"vi\">@buf<\/span><\/div><div class=\"line\" id=\"LC56\">&nbsp;&nbsp;&nbsp;&nbsp;<span ' +
  'class=\"k\">return<\/span> <span class=\"kp\">true<\/span> <span class=\"k\">if<\/span> <span class=\"vi\">@buf<\/span><span class=\"o\">.<\/span><span class=\"n\">closed?<\/span><\/div><div class=\"line\" id=\"LC57\">&nbsp;&nbsp;<span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC58\"><span class=\"k\">end<\/span><\/div><div class=\"line\" id=\"LC59\">&nbsp;<\/div><div class=\"line\" id=\"LC60\"><span class=\"no\">File<\/span><span class=\"o\">.<\/span><span class=\"n\">open<\/span><span class=\"p\">(<\/span><span class=\"s1\">&#39;./jawiki-latest-pages-articles.xml&#39;<\/span><span class=\"p\">)<\/span> <span class=\"p\">{<\/span> <span class=\"o\">|<\/span><span class=\"n\">f<\/span><span class=\"o\">|<\/span><\/div><div class=\"line\" id=\"LC61\">&nbsp;&nbsp;<span class=\"no\">Nokogiri<\/span><span class=\"o\">::<\/span><span class=\"no\">XML<\/span><span class=\"o\">::<\/span><span class=\"no\">SAX<\/span><span class=\"o\">::<\/span><span class=\"no\">Parser<\/span><span class=\"o\">.<\/span><span class=\"n\">new<\/span><span class=\"p\">(<\/span> <span class=\"no\">Doc<\/span><span class=\"o\">.<\/span><span class=\"n\">new<\/span> <span class=\"p\">)<\/span><span class=\"o\">.<\/span><span class=\"n\">parse<\/span><span class=\"p\">(<\/span> <span class=\"n\">f<\/span> <span class=\"p\">)<\/span><\/div><div class=\"line\" id=\"LC62\"><span class=\"p\">}<\/span><\/div><div class=\"line\" id=\"LC63\">&nbsp;<\/div><\/pre><\/div>\n          \n        <\/div>\n\n        <div class=\"gist-meta\">\n          <a href=\"http://gist.github.com/raw/102931/a758ffc4a156e91953908b4bc1f7d52c35a729ea/gistfile1.rb\" style=\"float:right;\">view raw<\/a>\n          <a href=\"http://gist.github.com/102931#file_gistfile1.rb\" style=\"float:right;margin-right:10px;color:#666\">gistfile1.rb<\/a>\n          <a href=\"http://gist.github.com/102931\">This Gist<\/a> brought to you by <a href=\"http://github.com\">GitHub<\/a>.\n        <\/div>\n      <\/div>\n    \n  ' +
  '\n<\/div>\n');
