@fidelisrafael
Last active November 17, 2019 08:45
Nginx Log Parser for Ruby
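The gist has two parts: the `Application::NginxLogParser` class (which, judging by the `require_relative` call below, lives in a file named `application_nginx_log_parser.rb`) and a driver script that uses it to extract search terms from `/search/questions` requests and write JSON reports.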
# require 'pry' # only needed for interactive debugging; not used below
require 'json'
require 'uri'
module Application
  class NginxLogParser
    DEFAULT_FORMAT_REGEXP = /(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s?\-\s?-\s?\[(\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\]\s?\\?"?(GET|POST|PUT|HEAD|DELETE|OPTIONS)\s?(.*?)\s(HTTP\/\d\.\d)\\?"?\s?(\d{3})\s?(\d+)\s?\\?\"\-\\?\"\s?\\?\"(.*?)\"/i
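    # For illustration, a log line in the shape this regexp expects (nginx's default
    # "combined" format with an empty "-" referer); the values below are made up:
    #
    #   203.0.113.7 - - [17/Nov/2019:08:45:00 +0000] "GET /search/questions?q=ruby&page=1 HTTP/1.1" 200 512 "-" "Mozilla/5.0"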
    # Capture groups of the regexp above, in order
    REQUEST_FORMAT = [
      :ip_address,
      :date,
      :symbol, # the timezone sign ("+" or "-") captured inside the date group
      :request_method,
      :request_path,
      :http_version,
      :response_status,
      :body_size,
      :user_agent
    ]
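    # Each parsed line becomes a hash keyed by these symbols, e.g. (illustrative values):
    #
    #   { ip_address: "203.0.113.7", date: "17/Nov/2019:08:45:00 +0000", symbol: "+",
    #     request_method: "GET", request_path: "/search/questions?q=ruby&page=1",
    #     http_version: "HTTP/1.1", response_status: "200", body_size: "512",
    #     user_agent: "Mozilla/5.0" }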
    attr_reader :log_file, :current_line, :percent_read, :total_lines

    def initialize(log_file, regexp = nil)
      @log_file = File.open(log_file)
      # Count the lines up front so progress can be reported while parsing
      @total_lines = File.foreach(log_file).count
      @percent_read = 0
      @regexp = regexp || DEFAULT_FORMAT_REGEXP
    end
    def parse
      while (line_data = readline)
        parsed_line = parse_line_to_object(line_data)
        yield(parsed_line) if block_given?
      end
    end
    def parse_matching(regexp, field = :request_path)
      parsed_results = []

      parse do |parsed_line|
        field_value = parsed_line[field]
        next unless field_value

        if (matches = field_value.match(regexp))
          parsed_results << parsed_line
          yield(parsed_line, matches) if block_given?
        end
      end

      parsed_results # every parsed line whose field matched the regexp
    end
    private

    def readline
      return nil if @log_file.eof?

      line = @log_file.readline
      @current_line = @log_file.lineno # number of lines read from this file so far
      @percent_read = (@current_line * 100) / @total_lines

      line
    end
    def parse_line_to_object(line)
      matches = line.match(@regexp)
      data = matches ? matches.captures : []

      Hash[REQUEST_FORMAT.zip(data)]
    end
  end
end
require_relative 'application_nginx_log_parser'
start_time = Time.now
RESULT_FILE = 'nginx.search.results.json'
SEARCH_REGEXP = /search\/questions\?(.*?)((q\=(.*?)\&))/i
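# SEARCH_REGEXP pulls the raw "q" value out of request paths such as
# "/search/questions?q=ruby&page=1" (illustrative); note that it only matches
# when another parameter follows "q", since the pattern requires a trailing "&".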
def parse_file
  match_results = []

  # Without a second argument, the default nginx log format is assumed
  parser = Application::NginxLogParser.new('nginx.access.log')

  # A custom regexp can also be passed; its capture groups are mapped positionally
  # onto NginxLogParser::REQUEST_FORMAT:
  # parser = Application::NginxLogParser.new('nginx.access.log', /(.*)/)

  # Parse the log and collect only the lines whose request path matches SEARCH_REGEXP
  parser.parse_matching(SEARCH_REGEXP) do |parsed_line, matches|
    print "\r#{parser.current_line}/#{parser.total_lines} = #{parser.percent_read}%"
    match_results << { search: matches[-1] }.merge(parsed_line)
  end

  # Dump the matched lines as a single JSON array
  File.open(RESULT_FILE, 'wb') do |f|
    f.write(JSON.generate(match_results))
  end
end
def skip_log_file_parse?
  ARGV.include?('-s') || ARGV.include?('--skip-log-parse')
end

parse_file unless skip_log_file_parse?
# Group the results by search term
data = JSON.parse(File.read(RESULT_FILE))

# URI.decode is obsolete (removed in Ruby 3.0); decode_www_form_component also
# turns "+" into spaces, so no extra gsub is needed here
grouped_data = data.group_by { |d| URI.decode_www_form_component(d["search"]) }
total_by_term = grouped_data.map { |term, entries| { search: term, total: entries.size } }
sorted = total_by_term.sort_by { |entry| entry[:total] }

# Write a nicely formatted report of search totals per term, most searched first
File.open('total_searches.json', 'wb') do |f|
  total_searches = sorted.sum { |entry| entry[:total] }
  json = JSON.pretty_generate({ total_searches: total_searches, report: sorted.reverse })
  f.write(json)
end
end_time = Time.now
runtime = (end_time - start_time)
puts "\nExecuted in %s seconds" % runtime