@robperc
Last active February 28, 2016 04:24
Concurrent Apache Log Parsing w/ multiprocessing module
"""
Concurrently parse Apache logs and generate a dictionary aggregating 'ip / hostname: frequency' pairs.
"""
from multiprocessing import Pool
from collections import Counter
def readLog(path):
    """
    Counts occurrences of unique IPs and hostnames found in the Apache log at the input path.
    Args:
        path (str): Path to the input Apache log.
    Returns:
        Counter containing 'unique_ip: occurrences' pairs.
    """
    ips = Counter()
    with open(path, "r") as infile:
        for line in infile:
            # The first whitespace-delimited field of a Common Log Format line
            # is the client IP or hostname.
            ip = line.split(' ')[0].strip()
            ips[ip] += 1
    return ips
def readLogs(paths):
    """
    Concurrently counts occurrences of unique IPs and hostnames found in Apache logs.
    Args:
        paths (list(str)): Paths to the Apache logs to parse.
    Returns:
        Counter aggregating the 'unique_ip: occurrences' pairs from each log.
    """
    # One worker process per log file; each parses its file independently.
    pool = Pool(processes=len(paths))
    multiple_results = [pool.apply_async(readLog, (path,)) for path in paths]
    # Merge the per-file Counters into a single aggregate Counter.
    results = reduce(Counter.__add__, [res.get() for res in multiple_results])
    pool.close()
    pool.join()
    return results
files = ("/tmp/NASA_access_log_Jul95", "/tmp/NASA_access_log_Aug95")
for ip, freq in readLogs(files).most_common():
print "%s: %s" % (ip, str(freq))