@jaysoncena
Created March 10, 2016 13:43
PySpark code to analyze Tomcat logs (can also be used with Apache HTTPd logs)
%pyspark
import re
from datetime import datetime

from pyspark.sql import Row

## S3 credentials and log location (placeholder values)
access = "ABCD"
secret = "efghijk"
bucket = "s3-bucket-logs"
file_path = "server[1-3]/*.log"
## Parse access-log lines of the form:
## 101.110.51.186 - - [07/Mar/2016:03:17:59 +0000] "GET /this/is/the/path HTTP/1.1" 500 196
pattern = re.compile(r"^([^ ]+) - - \[([^\]]+)\] \"([^ ]+) ([^ ]+) [^ ]+ ([^ ]+) ([^ ]+).*$")
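## For reference, the pattern applied to the sample line above captures:
##   ('101.110.51.186', '07/Mar/2016:03:17:59 +0000', 'GET',
##    '/this/is/the/path', '500', '196')
## i.e. IP address, timestamp, HTTP method, URL, response code and response size,
## in the same order the Row fields are filled in below.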
## Read the raw log files from S3
lines = sc.textFile("s3n://{access}:{secret}@{bucket}/{file_path}".format(
    access=access,
    secret=secret,
    bucket=bucket,
    file_path=file_path
))
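## Note: the credentials above end up embedded in the S3 URL. As an alternative
## sketch (commented out, not part of the script above), they could instead be set
## on the Hadoop configuration and left out of the path:
# sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", access)
# sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", secret)
# lines = sc.textFile("s3n://{bucket}/{file_path}".format(bucket=bucket, file_path=file_path))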
## Drop lines that do not match the pattern, then map the captured groups to Rows
## (responseCode and responseSize are kept as strings)
matches = lines.map(lambda l: pattern.match(l)).filter(lambda m: m is not None)
lineRows = matches.map(lambda m: m.groups()).map(lambda g: Row(
    ipAddress=g[0],
    dateTime=datetime.strptime(g[1].split(" ")[0], "%d/%b/%Y:%H:%M:%S"),
    method=g[2],
    url=g[3],
    responseCode=g[4],
    responseSize=g[5]
))
## Build a DataFrame and register it so the logs can be queried with Spark SQL
logsDf = sqlContext.createDataFrame(lineRows)
logsDf.registerTempTable("logs")
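
With the temp table registered, the parsed logs can be queried with Spark SQL. The query below is a minimal illustrative sketch, not part of the gist itself: it lists the URLs with the most 5xx responses. Since responseCode is stored as a string, a prefix match is used instead of a numeric comparison.

## Example: top URLs by number of server errors
errors = sqlContext.sql("""
    SELECT url, COUNT(*) AS error_count
    FROM logs
    WHERE responseCode LIKE '5%'
    GROUP BY url
    ORDER BY error_count DESC
    LIMIT 20
""")
errors.show()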