Skip to content

Instantly share code, notes, and snippets.

Created May 6, 2014 15:37
Show Gist options
  • Save nagyv/fac008b5cfb8c84d21f2 to your computer and use it in GitHub Desktop.
Save nagyv/fac008b5cfb8c84d21f2 to your computer and use it in GitHub Desktop.
A Python script for downloading Amazon S3 access logs and processing them with GoAccess.
import os
from boto.s3.connection import S3Connection
import subprocess
from datetime import datetime, date
import argparse
import tempfile
import json
# Command-line interface. All four positionals are required and must be given
# in this order; pass empty strings for the key and secret to let boto look up
# credentials on its own (the entry point treats empty values as "not given").
parser = argparse.ArgumentParser(description="Downloads logs from S3, and parses them with goaccess.")
# No `default=` here: argparse ignores defaults on required positionals.
parser.add_argument("aws_key", help="Amazon identification key")
parser.add_argument("aws_secret", help="Amazon identification key secret")
parser.add_argument(
    "input_bucket",
    help="Name of the S3 bucket where the logs are to be found (the [BUCKET] part of s3://[BUCKET]/[PATH]/)",
)
parser.add_argument(
    "input_path",
    help="Key prefix inside the bucket where the logs are to be found (the [PATH]/ part of s3://[BUCKET]/[PATH]/)",
)
parser.add_argument("-v", "--verbose", help="Verbose output", action="store_true", default=False)
parser.add_argument("-d", "--date", help="The date to run the report on in YYYY-MM-DD format")
def log(*msg):
    """Print a progress message when verbose output was requested.

    The verbosity flag is read from the module-level ``args`` namespace that
    the script entry point creates. When the module is imported instead of
    being run as a script, ``args`` does not exist and nothing is printed.

    :param msg: message parts; they are converted to text and joined with
        single spaces.
    """
    cli_args = globals().get("args")
    if cli_args is None or not getattr(cli_args, "verbose", False):
        return
    # Join the parts so the message itself is shown, not the tuple's repr.
    print(" ".join(str(part) for part in msg))
class GoAccess(object):
    """Download S3 access logs for one day and summarise them with goaccess.

    We download the log files from S3, then concatenate them, and pass the
    results to goaccess. It gives back a JSON that we can handle further in
    :meth:`process_results`.
    """

    def __init__(self, input_bucket, input_path, date_filter, aws_keys=None):
        """
        :param input_bucket: name of the S3 bucket that holds the logs
        :param input_path: key prefix used when listing the bucket
        :param date_filter: object providing ``strftime`` (a ``date`` or
            ``datetime``); only keys containing this day are processed
        :param aws_keys: a list of (aws key, secret key); when falsy the S3
            connection is created without explicit credentials
        """
        self.input_bucket = input_bucket
        self.input_path = input_path
        self.date_filter = date_filter
        self.aws_keys = aws_keys

    def _create_goconfig(self):
        """Create a temporary goaccessrc file with the necessary formatting.

        The open file object is kept on ``self.configfile``; the file
        disappears from disk as soon as that object is closed.
        """
        self.configfile = tempfile.NamedTemporaryFile()
        # Bytes literal because NamedTemporaryFile opens in binary mode.
        self.configfile.write(b"""color_scheme 0
date_format %d/%b/%Y
log_format %^ %^ [%d:%^] %h %^ %^ %^ %^ "%^ %r %^" %s %^ %b %^ %^ %^ "%^" "%u" %^
""")
        # Flush so goaccess sees the content when it opens the file by name.
        self.configfile.flush()

    def is_needed(self, filename):
        """Tell whether a log file should be processed.

        Only files that return true will be processed. By default the file
        name should contain ``access_log-`` and the filtered date formatted
        as YYYY-MM-DD.

        :param filename: the S3 key (or file name) to check
        :return: True when the file belongs in the report
        """
        day = self.date_filter.strftime("%Y-%m-%d")
        return "access_log-" in filename and day in filename

    def concat_files(self, outfile, filename):
        """Append the content of ``filename`` to ``outfile``.

        :param outfile: writable file object opened in binary mode
        :param filename: path of the local file to append
        """
        last_chunk = b""
        with open(filename, "rb") as infile:
            # Copy in chunks so large logs are not loaded into memory at once.
            for chunk in iter(lambda: infile.read(65536), b""):
                outfile.write(chunk)
                last_chunk = chunk
        # Keep the last line of this file from merging with the first line
        # of the next one.
        if last_chunk and not last_chunk.endswith(b"\n"):
            outfile.write(b"\n")

    def download_logs(self):
        """Download the needed logs from S3 using Boto.

        This is a generator: each object is downloaded only when the consumer
        asks for the next item. The files are stored in a fresh temporary
        directory that is NOT removed here; cleaning it up is left to the
        caller.

        :return: yields the local path of every downloaded log file
        """
        if self.aws_keys:
            conn = S3Connection(*self.aws_keys)
        else:
            # No explicit keys: let boto find the credentials on its own.
            conn = S3Connection()
        bucket = conn.get_bucket(self.input_bucket)
        tempdir = tempfile.mkdtemp()
        for item in bucket.list(prefix=self.input_path):
            if not self.is_needed(item.key):
                continue
            local_file = os.path.join(tempdir, item.key.split("/")[-1])
            log("Downloading %s to %s" % (item.key, local_file))
            item.get_contents_to_filename(local_file)
            yield local_file

    def process_results(self, json):
        """Handle the parsed goaccess report.

        This is the main method to be overwritten by implementors.

        :param json: A JSON object result from goaccess to be processed
            further.
        """
        # NOTE(review): the default body was lost from the source; printing
        # the report is an assumption, confirm against the original script.
        print(json)

    def run(self):
        """Download the logs, run goaccess on them and process its report.

        :return: True once the report was handed to :meth:`process_results`
        :raises subprocess.CalledProcessError: when goaccess exits with a
            non-zero status
        """
        self._create_goconfig()
        try:
            with tempfile.NamedTemporaryFile() as temp_log:
                for downloaded in self.download_logs():
                    self.concat_files(temp_log, downloaded)
                log("Creating report")
                temp_log.flush()  # needed to have the temp file written for sure
                command = [
                    "goaccess",
                    "-f", temp_log.name,
                    "-o", "json",
                    "-p", self.configfile.name,
                ]
                server = subprocess.Popen(command, stdout=subprocess.PIPE)
                out, _ = server.communicate()
        finally:
            # Closing the temporary config file also deletes it.
            self.configfile.close()
        if server.returncode != 0:
            raise subprocess.CalledProcessError(server.returncode, command)
        # Local alias: the parameter of process_results is also named `json`.
        report = json.loads(out)
        self.process_results(report)
        return True
if __name__ == "__main__":
    # `args` is deliberately module-level: log() reads its `verbose` flag.
    args = parser.parse_args()

    # Report on the requested day, or on today when no date was given.
    if args.date:
        given_date = datetime.strptime(args.date, "%Y-%m-%d").date()
    else:
        given_date = date.today()

    # Only pass explicit credentials when both parts were provided.
    if args.aws_key and args.aws_secret:
        aws_keys = (args.aws_key, args.aws_secret)
    else:
        aws_keys = None

    processor = GoAccess(args.input_bucket, args.input_path, given_date, aws_keys)
    processor.run()
Copy link

I am getting an error while running this script. I have Python 2.7, pip 8.1, and boto 2.

python -v -d 2017-12-22 aws_key='' aws_secret='' input_bucket=s3://demo input_path=s3://demo/logs


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment