Created
March 14, 2013 15:08
-
-
Save anonymous/5162114 to your computer and use it in GitHub Desktop.
Analyze access logs to find the accesses whose referrer is a "t.co" URL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# -*- coding: utf-8 -*- | |
# ruby hateref.rb [Hatena log file ...] | |
# Available with Ruby 1.9/2.0/2.1 | |
require "csv" | |
require "time" | |
require "kconv" | |
# Collects referrer statistics from access-log CSV files, keeping only the
# accesses whose referrer URL contains "http://t.co/".
#
# Each CSV row is read positionally: column 0 is the access date, column 2 is
# the referrer URL and column 7 is the visited URL.
class Hateref
  def initialize
    # url_ref => [url_ref, url_vis, count, first_date]
    @ref = {}
  end

  # Reads one log file and merges its "t.co" referrers into the statistics.
  #
  # filename - path to a CSV log file; a name ending in ".gz" is decompressed
  #            through the external `zcat` command.
  #
  # For every referrer the record keeps the visited URL of the first row seen,
  # the number of matching rows, and the earliest date. Dates are compared as
  # strings, which orders correctly for the "YYYY-MM-DD HH:MM:SS" form shown
  # in the sample log line (assumed to hold for every row).
  def read_log(filename)
    basename = File.basename(filename).sub(/\.csv(\.gz)?$/, "")
    puts "basename: #{basename}" if $DEBUG
    linenum = 0
    # Example of line: "2013-02-27 22:38:22,***.***.***.***,http://t.co/8KwYui0Re7,"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.97 Safari/537.22","ja,en-US;q=0.8,en;q=0.6",1366x768,32,http://d.hatena.ne.jp/takehikom/20121105/1352042888"
    CSV.new(read_text(filename).toutf8).each do |param|
      linenum += 1
      date = param[0]
      url_ref = param[2]
      url_vis = param[7]
      if $DEBUG
        puts "line no.#{linenum}"
        puts "date: #{date}"
        puts "from: #{url_ref}"
        puts " to: #{url_vis}"
      end
      next if url_ref.nil?
      # Without this filter every referrer would be collected, which floods
      # the output.
      next unless url_ref.index("http://t.co/")

      record(url_ref, url_vis, date)
    end
  end

  # Prints one multi-line record per referrer, most frequent first, with a
  # blank line after each record.
  def print1
    @ref.values.sort_by { |entry| entry[2] }.reverse_each do |url_ref, url_vis, count, date|
      puts " from: #{url_ref}"
      puts " to: #{url_vis}"
      puts "first: #{date}"
      puts "count: #{count}"
      puts
    end
  end

  # Prints one CSV line per referrer (referrer, visited URL, first date),
  # ordered by referrer URL.
  def print2
    @ref.values.sort_by { |entry| entry[0] }.each do |url_ref, url_vis, _count, date|
      puts [url_ref, url_vis, date].to_csv
    end
  end

  private

  # Returns the raw content of the log file, decompressing ".gz" files.
  # The command is passed as an argv array and plain files go through
  # File.open, so the filename is never interpreted by a shell or as a pipe.
  def read_text(filename)
    if filename.end_with?(".gz")
      IO.popen(["zcat", filename], &:read)
    else
      File.open(filename, &:read)
    end
  end

  # Counts one access for url_ref and keeps the earliest date seen so far.
  def record(url_ref, url_vis, date)
    entry = @ref[url_ref]
    if entry
      entry[2] += 1
      # Replace (not append to) the stored date when an earlier one shows up.
      entry[3] = date if date && (entry[3].nil? || date < entry[3])
    else
      @ref[url_ref] = [url_ref, url_vis, 1, date]
    end
  end
end
# Command-line entry point: runs only when this file is executed directly.
# Reads every log file named on the command line, then prints both reports.
if $PROGRAM_NAME == __FILE__
  log_files = ARGV
  if log_files.empty?
    # Nothing to analyze: show how to invoke the script and stop.
    puts "usage: ruby hateref.rb csv_file ..."
    exit
  end

  analyzer = Hateref.new
  log_files.each { |path| analyzer.read_log(path) }
  analyzer.print1
  analyzer.print2
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment