Skip to content

Instantly share code, notes, and snippets.

@keymon
Created January 15, 2012 04:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save keymon/1614315 to your computer and use it in GitHub Desktop.
Save keymon/1614315 to your computer and use it in GitHub Desktop.
This script reads Apache log file entries from standard input, parses them, and stores them in a MongoDB database.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This script reads Apache log file entries from standard input, parses them,
# and stores them in a MongoDB database.
#
# Requires:
# - pymongo
# - apachelog: http://code.google.com/p/apachelog
#
import sys
import apachelog
import pymongo
from datetime import datetime
# MongoDB connection settings and the target database/collection.
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB_NAME = 'logging_db'
MONGO_COLLECTION_NAME = 'access_log_collection'
# Size used when creating the capped collection.
MONGO_COLLECTION_SIZE = 1024 * 1024  # 1MB
# Apache "combined" access log format string handed to the apachelog parser.
ACCESS_LOG_FORMAT = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
# Aliases for apache log parser: maps a log-format directive to the
# friendlier key under which its value is stored.
ACCESS_ALIAS = {
    '%>s': 'status',
    '%b': 'size',
    '%h': 'client_ip',
    '%r': 'request',
    '%t': 'time',
    '%{Referer}i': 'referer',
    '%{User-Agent}i': 'user_agent',
}


# Configure alias
def apachelog_parser_alias(self, name):
    """Return the friendly alias for a log-format directive.

    Written with a ``self`` parameter because it is installed as a method
    on the apachelog parser class; ``self`` itself is not used.

    Args:
        self: The parser instance (ignored).
        name: A log-format directive such as ``'%h'``.

    Returns:
        The alias from ``ACCESS_ALIAS`` if one is defined, otherwise
        ``name`` unchanged.
    """
    # dict.get() replaces the deprecated has_key() check-then-index pattern
    # and works on both Python 2 and Python 3.
    return ACCESS_ALIAS.get(name, name)
# Monkey-patch the apachelog parser class so that parsed fields are keyed by
# the names returned from apachelog_parser_alias (e.g. 'time', 'request').
apachelog.parser.alias = apachelog_parser_alias
# Initialize mongo
def do_mongo_init():
    """Connect to MongoDB and return the access-log collection.

    Tries to create ``MONGO_COLLECTION_NAME`` as a capped collection of
    ``MONGO_COLLECTION_SIZE`` so that it behaves like a rotating log. If
    the collection already exists, the existing one is reused as-is.

    Returns:
        A ``(connection, db, collection)`` tuple.
    """
    # Connect allowing slave
    connection = pymongo.Connection(MONGO_HOST, MONGO_PORT, slave_okay=True)
    db = connection[MONGO_DB_NAME]
    # Mongo will create the collection automatically, BUT we want to set a
    # capped collection to enable "log rotation".
    # If you try to create a collection that already exists, it throws
    # pymongo.errors.CollectionInvalid.
    try:
        collection = db.create_collection(
            MONGO_COLLECTION_NAME,
            capped=True,
            size=MONGO_COLLECTION_SIZE)
        # Parenthesised single-argument form prints the same on Python 2
        # and Python 3.
        print("New collection created")
    except pymongo.errors.CollectionInvalid:
        # Already exists: fetch it by the *collection* name. Indexing with
        # the database name here would silently select a different,
        # non-capped collection.
        collection = db[MONGO_COLLECTION_NAME]
    return (connection, db, collection)
def main():
    """Parse Apache access-log lines from stdin and insert them into MongoDB.

    Each line is parsed with ``ACCESS_LOG_FORMAT``. The bracketed time
    field is split into a ``timestamp`` (``datetime``) and a ``timezone``
    (the ``+HHMM`` offset converted with ``int``, e.g. ``'+0100'`` -> 100).
    When the request line has exactly three space-separated parts it is
    also split into ``method``, ``url`` and ``proto``.

    Lines that cannot be parsed are reported on stderr and skipped. The
    Mongo connection is always closed, even if an unexpected error occurs.
    """
    mongo_connection, _, mongo_collection = do_mongo_init()
    try:
        parser = apachelog.parser(ACCESS_LOG_FORMAT)

        # Process each line from stdin; readline() returns '' at end of input.
        for logline in iter(sys.stdin.readline, ''):
            try:
                data = parser.parse(logline)
                # Here you can process data to remove some items or similar.

                # Date processing. Important to save it as datetime to allow
                # date queries. The raw value looks like
                # "[15/Jan/2012:04:28:00 +0100]"; strip the brackets first.
                (timestamp_str, timezone_str) = data['time'][1:-1].split(' ')
                data['timestamp'] = datetime.strptime(
                    timestamp_str, "%d/%b/%Y:%H:%M:%S")
                data['timezone'] = int(timezone_str)
            except (apachelog.ApacheLogParserError, ValueError):
                # Ignore invalid lines. ValueError covers a malformed time
                # field (wrong number of parts, bad date, bad offset), which
                # previously aborted the whole run.
                sys.stderr.write('Invalid log format: "' + logline + '"\n')
                continue

            # Split the request line when it has the usual
            # "METHOD URL PROTOCOL" shape. Apache can log other shapes
            # (e.g. "-"); those entries are still stored, with only the raw
            # 'request' field, instead of crashing on the unpacking.
            request_parts = data['request'].split(' ')
            if len(request_parts) == 3:
                (data['method'], data['url'], data['proto']) = request_parts

            # store the data in the Mongo DB
            mongo_collection.insert(data)
    finally:
        mongo_connection.disconnect()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment