Created
March 14, 2017 09:44
-
-
Save igaiga/28842248107cb19733b252fb44219826 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Analyze Wikipedia access data.
#   Dumps:       https://dumps.wikimedia.org/other/pageviews/
#   Data format: https://dumps.wikimedia.org/other/pagecounts-raw/
#
# Each dump line is whitespace-separated; this script uses the first field
# (project code), the second (page title) and the third (view count).
# A bare language code such as "ja" or "en" in the first field denotes
# Wikipedia itself; see the Data format page for the meaning of "ja.X" codes.
#
# NOTE(review): the original notes say that only lines starting with "ja "
# are counted, but the filter that is currently active keeps "en " lines whose
# title contains "Tokyo_Station" and echoes each raw line (presumably the
# state the script was left in after capturing the sample data). Change the
# prefix and enable one of the commented-out title filters to rank Japanese
# pages.
#
# Sample data that was collected:
#   "en Tokyo_Station 3 0\n"
#   "ja 名古屋駅 3 0\n"
#   "ja 大阪駅 4 0\n"
#   "ja 東京駅 1 0\n"
# Sample result for the three "ja" lines above:
#   {:title=>"大阪駅", :count=>"4"}
#   {:title=>"名古屋駅", :count=>"3"}
#   {:title=>"東京駅", :count=>"1"}

# Dump file to read. Pass another path as the first command-line argument
# (e.g. "pageviews-20170203-200000") instead of editing this line.
filename = ARGV[0] || "pageviews-20170101-000000"

access_data = []

# File.foreach closes the file by itself, even when an exception interrupts
# the scan, so no explicit close is needed.
File.foreach(filename, encoding: "UTF-8") do |text|
  next unless text.start_with?("en ")

  _project, title, count = text.split

  # A line without a title field leaves title nil; such lines are skipped.
  next unless title && title.include?("Tokyo_Station")
  # next unless title =~ /Station\z/ # ranking of station names that were looked up
  # next unless ['東京駅', '名古屋駅', '大阪駅'].include?(title) # sample code limited to three stations

  p text # echo the raw line; code used for capturing the sample data file

  # count is kept as the original string so the printed hashes are unchanged.
  access_data.push({ title: title, count: count })
end

# Show the top 20 entries, highest view count first. The count is converted
# to an integer only for comparison.
top_entries = access_data.max_by(20) { |entry| entry[:count].to_i }
top_entries.each { |entry| puts entry }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment