Created
January 15, 2012 04:28
-
-
Save keymon/1614315 to your computer and use it in GitHub Desktop.
This script reads Apache access log entries from standard input, parses them, and stores them in a MongoDB database.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# | |
# This script reads apache log files entries from the standard input, parses them | |
# and stores them in a mongo database. | |
# | |
# Requires: | |
# - pymongo | |
# - apachelog: http://code.google.com/p/apachelog | |
# | |
import sys | |
import apachelog | |
import pymongo | |
from datetime import datetime | |
# MongoDB connection settings and the destination collection.
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB_NAME = 'logging_db'
MONGO_COLLECTION_NAME = 'access_log_collection'
# Size given to the capped collection when it is created: 1MB.
MONGO_COLLECTION_SIZE = 1024 * 1024
# Access log format handed to the apachelog parser ("combined" format).
ACCESS_LOG_FORMAT = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'

# Readable field names used in place of the raw format directives.
# Directives without an entry here (e.g. %l, %u) keep their original name.
ACCESS_ALIAS = {
    '%h': 'client_ip',
    '%t': 'time',
    '%r': 'request',
    '%>s': 'status',
    '%b': 'size',
    '%{Referer}i': 'referer',
    '%{User-Agent}i': 'user_agent',
}
# Configure alias | |
def apachelog_parser_alias(self, name):
    """Return the readable alias for a log format directive.

    Installed below as ``apachelog.parser.alias``, hence the ``self``
    parameter (unused here).

    Args:
        name: A format directive such as ``'%h'`` or ``'%{Referer}i'``.

    Returns:
        The matching value from ACCESS_ALIAS, or ``name`` unchanged when
        no alias is defined for it.
    """
    # dict.get() replaces the deprecated dict.has_key() lookup.
    return ACCESS_ALIAS.get(name, name)


# Replace the parser's alias hook so parsed entries use the names above.
apachelog.parser.alias = apachelog_parser_alias
# Initialize mongo | |
def do_mongo_init():
    """Connect to MongoDB and return the access-log collection.

    MongoDB would create the collection automatically on first insert, but
    it is created explicitly here so that it can be capped, which enables
    a simple form of "log rotation".

    Returns:
        A ``(connection, db, collection)`` tuple.
    """
    # Connect allowing slave.
    # NOTE(review): pymongo.Connection is the legacy client class; confirm
    # that the installed pymongo release still provides it.
    connection = pymongo.Connection(MONGO_HOST, MONGO_PORT, slave_okay=True)
    db = connection[MONGO_DB_NAME]

    try:
        collection = db.create_collection(
            MONGO_COLLECTION_NAME,
            capped=True,
            size=MONGO_COLLECTION_SIZE)
    except pymongo.errors.CollectionInvalid:
        # Creating a collection that already exists throws
        # CollectionInvalid, so fetch the existing one. It must be looked
        # up by the collection name, not by the database name.
        collection = db[MONGO_COLLECTION_NAME]
    else:
        print("New collection created")
    return (connection, db, collection)
def main():
    """Read access log lines from stdin and store each one in MongoDB.

    Every line is parsed with the apachelog parser and extended with:
      - ``timestamp``: the request time as a datetime, so that date
        queries are possible,
      - ``timezone``: the UTC offset field converted with int()
        (e.g. "+0100" is stored as 100),
      - ``method``, ``url``, ``proto``: the three parts of the request.

    Lines that cannot be parsed, or whose time or request fields do not
    have the expected shape, are reported on stderr and skipped.
    """
    mongo_connection, _, mongo_collection = do_mongo_init()
    parser = apachelog.parser(ACCESS_LOG_FORMAT)

    try:
        # readline() returns '' only at end of input. It is used instead of
        # iterating over sys.stdin to avoid the read-ahead buffering that
        # file iteration performs under Python 2.
        for logline in iter(sys.stdin.readline, ''):
            try:
                data = parser.parse(logline)
                # Here you can process data to remove some items or similar.

                # Date processing. Important to save it as datetime to
                # allow date queries. The field looks like
                # "[15/Jan/2012:04:28:00 +0100]".
                (timestamp_str, timezone_str) = data['time'][1:-1].split(' ')
                data['timestamp'] = datetime.strptime(
                    timestamp_str, "%d/%b/%Y:%H:%M:%S")
                data['timezone'] = int(timezone_str)

                # Split the request line into its three components.
                (data['method'], data['url'], data['proto']) = \
                    data['request'].split(' ')
            except (apachelog.ApacheLogParserError, ValueError):
                # ValueError covers a malformed time field or a request
                # that is not exactly "METHOD URL PROTO" (e.g. "-"); such
                # a line must not abort the whole import.
                sys.stderr.write(
                    'Invalid log format: "%s"\n' % logline.rstrip('\r\n'))
                continue

            # Store the data in the Mongo DB.
            mongo_collection.insert(data)
    finally:
        # Release the connection even if processing fails part-way.
        mongo_connection.disconnect()
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment