Last active
July 3, 2016 14:13
-
-
Save igneus/71102387be3b7513182fdd9a75cfef09 to your computer and use it in GitHub Desktop.
Analyze incoming traffic from seznam.cz search in your Apache access log
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script analyzing incoming traffic from seznam.cz search
# in an Apache access log.
# Seznam search is "kind enough" to preserve the query in the referer.
#
# Typical usage:
#   grep search.seznam.cz access.log | seznam_searches.rb
#
# Grepping the log first isn't strictly necessary; it just spares the
# script from parsing every line, so that it only has to work on the
# interesting ones.
require 'apache_log/parser' | |
require 'date' | |
require 'uri' | |
require 'cgi' | |
require 'set' | |
# Accumulates statistics about search-engine hits (query, landing URL,
# client IP and date of each hit) and prints a plain-text summary.
class SearchAnalyzer
  def initialize
    # Counters default to 0 so that records can simply be tallied.
    @queries = Hash.new(0)   # query string => number of hits
    @urls = Hash.new(0)      # landing URL  => number of hits
    @dates = Hash.new(0)     # date         => number of hits
    @ips = Set.new           # distinct client IPs
    @query_pairs = Set.new   # distinct [query, url] combinations
    @records = 0             # total number of hits recorded
  end

  # Registers a single search hit.
  #
  # @param query [String, nil] the search phrase
  # @param url [String] the path that was requested
  # @param ip [String] the client address
  # @param date [Comparable] the day of the hit (must be orderable,
  #   because the reported period is computed with min/max)
  def add_record(query:, url:, ip:, date:)
    @records += 1
    @ips << ip
    @dates[date] += 1
    @queries[query] += 1
    @urls[url] += 1
    @query_pairs << [query, url]
  end

  # Prints the summary to standard output. When no record has been added
  # only the header and the (zero) hit count are printed, since the
  # "most ..." statistics are undefined for an empty data set.
  def print_analysis
    puts "Traffic from seznam.cz search:"
    puts
    puts "#{@records} search hits"
    return if @records.zero?

    puts "#{@ips.size} unique IPs"
    puts "on #{@dates.size} days (period #{start_date} - #{end_date})"
    puts
    puts "most searches: #{max_searches[1]} on #{max_searches[0]}"
    puts "most visited URL: #{most_visited[0]} (#{most_visited[1]} times)"
    puts "most searched: #{favorite_query[0]} (#{favorite_query[1]} times)"
    puts
    puts "All queries:"
    @query_pairs.each do |query, url|
      puts "#{query} -> #{url}"
    end
  end

  # @return the earliest recorded date, or nil when there are no records
  def start_date
    @dates.keys.min
  end

  # @return the latest recorded date, or nil when there are no records
  def end_date
    @dates.keys.max
  end

  # @return [Array, nil] [date, hits] for the busiest day
  def max_searches
    max_for(@dates)
  end

  # @return [Array, nil] [url, hits] for the most requested URL
  def most_visited
    max_for(@urls)
  end

  # @return [Array, nil] [query, hits] for the most frequent query
  def favorite_query
    max_for(@queries)
  end

  private

  # Returns the [key, count] pair with the highest count,
  # or nil when the counter is empty.
  def max_for(counter)
    counter.max_by { |_key, hits| hits }
  end
end
# Driver: reads Apache "combined" log lines from ARGF (files given as
# arguments, or standard input), feeds every hit referred by the
# seznam.cz search to the analyzer and finally prints the summary.
analyzer = SearchAnalyzer.new
parser = ApacheLog::Parser.new('combined')

ARGF.each_line do |line|
  parsed = parser.parse line

  # The referer may be missing; to_s makes the prefix check nil-safe.
  referer = parsed[:referer].to_s
  # Accept both plain and TLS variants of the search URL.
  next unless referer.start_with?('http://search.seznam.cz', 'https://search.seznam.cz')

  begin
    uri = URI.parse referer
  rescue URI::InvalidURIError
    # Referers are client-supplied and may be malformed; skip such lines
    # instead of aborting the whole analysis.
    next
  end

  # CGI.parse cannot handle nil, so skip referers without a query string.
  next if uri.query.nil?

  query = CGI.parse uri.query
  analyzer.add_record query: query['q'][0],
                      url: parsed[:request][:path],
                      ip: parsed[:remote_host],
                      date: parsed[:datetime].to_date
end

analyzer.print_analysis
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment