@hillar · Created April 27, 2016 17:52
Load & parse gzip'ed Apache logs and pump them into Elasticsearch
# Python 2 script.
import sys
# Force UTF-8 as the default codec so stray non-ASCII bytes in log lines
# don't raise UnicodeDecodeError (a common Python 2 workaround).
reload(sys)
sys.setdefaultencoding("utf-8")

from os import listdir
from os.path import isfile, join
import gzip

# Third-party dependencies: pip install apache-log-parser elasticsearch
import apache_log_parser
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
def unzipundparse(pathname, filenames, parser):
    """Stream parsed log entries as Elasticsearch bulk actions."""
    line_parser = apache_log_parser.make_parser(parser)
    for filename in filenames:
        path = pathname + "/" + filename
        # gzip.open lets us read the compressed log line by line
        # without unpacking it to disk first.
        with gzip.open(path, 'r') as f:
            counter = 0
            for line in f:
                counter = counter + 1
                if counter > 300: break  # test cap: only the first 300 lines per file
                data = line_parser(line)
                data['path'] = path  # remember which file the entry came from
                # Wrap the parsed fields in a bulk-API action.
                doc = {
                    '_op_type': 'create',
                    '_index': 'apache-test',
                    '_type': 'apache',
                    '_source': data
                }
                yield doc
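
# For reference, one parsed line yields a flat dict. A sketch (the exact
# field names come from apache_log_parser and may vary by version, so
# treat them as assumptions, not the library's guaranteed output):
#
#   line_parser('1.2.3.4 - - [27/Apr/2016:17:52:00 +0000] '
#               '"GET /index.html HTTP/1.1" 200 512 "-" "curl/7.47.0"')
#   # => {'remote_host': '1.2.3.4', 'status': '200',
#   #     'request_method': 'GET', 'request_url': '/index.html',
#   #     'request_header_user_agent': 'curl/7.47.0', ...}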
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "directory missing ;("
        sys.exit()
    mypath = sys.argv[1]
    # Take only plain files from the given directory (no subdirectories).
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    es = Elasticsearch()  # default: localhost:9200
    # Apache "combined" log format.
    parser = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\""
    # bulk() consumes the generator and returns a (successes, errors) tuple.
    stats = bulk(es, unzipundparse(mypath, onlyfiles, parser))
    print 'saved :', stats
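
Run it with the log directory as the only argument, e.g. `python loadlogs.py /var/log/apache2/` (the filename `loadlogs.py` is just a placeholder for whatever you saved the gist as). To confirm the documents actually landed, you can ask the cluster for a count; a minimal sketch, assuming the same local Elasticsearch and the `apache-test` index used above:

from elasticsearch import Elasticsearch

es = Elasticsearch()
# The _count API reports how many documents the index now holds.
print es.count(index='apache-test')['count']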