Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
This script reads Apache log file entries from standard input, parses them, and stores them in a MongoDB database.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This script reads Apache log file entries from standard input, parses them,
# and stores them in a MongoDB database.
#
# Requires:
# - pymongo
# - apachelog: http://code.google.com/p/apachelog
#
import sys
import apachelog
import pymongo
from datetime import datetime
# --- MongoDB connection settings -------------------------------------------
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB_NAME = 'logging_db'
MONGO_COLLECTION_NAME = 'access_log_collection'
# Size limit, in bytes, passed to create_collection for the capped collection.
MONGO_COLLECTION_SIZE = 1024 * 1024  # 1MB

# --- Apache access log format ("combined") ---------------------------------
ACCESS_LOG_FORMAT = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
# Aliases for the apache log parser: maps log-format directives to the
# friendlier field names used as keys in the parsed entry. Directives that
# are not listed here keep their raw name.
ACCESS_ALIAS = {
    '%>s': 'status',
    '%b': 'size',
    '%h': 'client_ip',
    '%r': 'request',
    '%t': 'time',
    '%{Referer}i': 'referer',
    '%{User-Agent}i': 'user_agent',
}


# Configure alias
def apachelog_parser_alias(self, name):
    """Return the friendly alias for the log-format directive *name*.

    The function takes ``self`` because it is assigned onto
    ``apachelog.parser`` as a method; the instance itself is not used.

    Args:
        self: The parser instance (unused).
        name: A log-format directive such as ``'%h'``.

    Returns:
        The alias from ``ACCESS_ALIAS`` if one is defined, otherwise *name*
        unchanged.
    """
    # dict.get replaces the deprecated dict.has_key (removed in Python 3)
    # and does the lookup once instead of twice.
    return ACCESS_ALIAS.get(name, name)
# Replace the `alias` attribute of the apachelog parser class with the
# function above, so lookups go through ACCESS_ALIAS.
# NOTE(review): presumably apachelog calls `alias` to name parsed fields;
# confirm against the apachelog documentation.
apachelog.parser.alias = apachelog_parser_alias
# Initialize mongo
def do_mongo_init():
    """Connect to MongoDB and obtain the access-log collection.

    The collection is created as a capped collection of
    ``MONGO_COLLECTION_SIZE`` bytes to get "log rotation". If it already
    exists, the existing collection is used as-is.

    Returns:
        A ``(connection, db, collection)`` tuple.
    """
    # Connect allowing slave.
    # NOTE(review): pymongo.Connection / slave_okay look like the legacy
    # pymongo 2.x API; confirm the installed pymongo version still provides
    # them before upgrading the driver.
    connection = pymongo.Connection(MONGO_HOST, MONGO_PORT, slave_okay=True)
    db = connection[MONGO_DB_NAME]

    # Mongo would create the collection automatically on first insert, BUT
    # we want a capped collection, so create it explicitly.
    # Creating a collection that already exists raises
    # pymongo.errors.CollectionInvalid.
    try:
        collection = db.create_collection(
            MONGO_COLLECTION_NAME,
            capped=True,
            size=MONGO_COLLECTION_SIZE,
        )
    except pymongo.errors.CollectionInvalid:
        # Already exists, get it. The lookup must use the *collection* name;
        # indexing with MONGO_DB_NAME would return a different collection.
        collection = db[MONGO_COLLECTION_NAME]
    else:
        # Single-argument call form prints the same text on Python 2 and 3.
        print("New collection created")

    return (connection, db, collection)
def _parse_log_line(parser, logline):
    """Parse one access-log line into a dict ready to be stored.

    On top of the fields produced by ``parser.parse``, the returned dict gets:

    * ``timestamp``: the bracketed ``time`` field as a naive ``datetime``
      (stored as a datetime so date queries are possible),
    * ``timezone``: the offset part of ``time`` converted with ``int``
      (so ``'+0200'`` becomes ``200``),
    * ``method``, ``url``, ``proto``: the three space-separated parts of
      ``request``.

    Raises:
        apachelog.ApacheLogParserError: if the line does not match the
            configured log format.
        ValueError: if the ``time`` or ``request`` field does not have the
            expected shape (for example a request logged as ``-``).
    """
    data = parser.parse(logline)

    # The time field looks like "[day/mon/year:HH:MM:SS +ZZZZ]": strip the
    # brackets, then separate the timestamp from the offset.
    timestamp_str, timezone_str = data['time'][1:-1].split(' ')
    data['timestamp'] = datetime.strptime(timestamp_str, "%d/%b/%Y:%H:%M:%S")
    data['timezone'] = int(timezone_str)

    # Split the request line into its three components.
    data['method'], data['url'], data['proto'] = data['request'].split(' ')
    return data


def main():
    """Read access-log lines from stdin and insert each one into MongoDB.

    Lines that cannot be parsed are reported on stderr and skipped. The
    Mongo connection is closed when stdin reaches EOF and also when
    processing aborts with an exception.
    """
    mongo_connection, _, mongo_collection = do_mongo_init()
    try:
        parser = apachelog.parser(ACCESS_LOG_FORMAT)

        # Process each line from stdin. readline() is called per entry
        # (rather than iterating the file object) so each line is handled
        # as soon as it is available; an empty string marks EOF.
        for logline in iter(sys.stdin.readline, ''):
            try:
                data = _parse_log_line(parser, logline)
            except (apachelog.ApacheLogParserError, ValueError):
                # Ignore invalid lines. ValueError covers entries whose
                # time/request fields cannot be split or converted; without
                # it one odd request line would abort the whole run.
                sys.stderr.write('Invalid log format: "' + logline + '"\n')
                continue

            # Store the data in the Mongo DB.
            mongo_collection.insert(data)
    finally:
        mongo_connection.disconnect()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment