@jaysoncena
Created March 10, 2016 13:43
PySpark code to analyze Tomcat logs (can also be used with Apache HTTPd logs)
%pyspark
import re
from datetime import datetime

from pyspark.sql import Row

## S3 credentials and log location (placeholder values)
access = "ABCD"
secret = "efghijk"
bucket = "s3-bucket-logs"
file_path = "server[1-3]/*.log"
## Parse access-log lines of the form:
## 101.110.51.186 - - [07/Mar/2016:03:17:59 +0000] "GET /this/is/the/path HTTP/1.1" 500 196
pattern = re.compile(r"^([^ ]+) - - \[([^\]]+)\] \"([^ ]+) ([^ ]+) [^ ]+ ([^ ]+) ([^ ]+).*$")
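## For reference, the pattern applied to the sample line above captures:
##   ('101.110.51.186', '07/Mar/2016:03:17:59 +0000', 'GET',
##    '/this/is/the/path', '500', '196')
## i.e. IP address, timestamp, HTTP method, URL, response code and response size,
## in the same order the Row fields are filled in below.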
## Read the raw log files from S3
lines = sc.textFile("s3n://{access}:{secret}@{bucket}/{file_path}".format(
    access=access,
    secret=secret,
    bucket=bucket,
    file_path=file_path
))
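## Note: the credentials above end up embedded in the S3 URL. As an alternative
## sketch (commented out, not part of the script above), they could instead be set
## on the Hadoop configuration and left out of the path:
# sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", access)
# sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", secret)
# lines = sc.textFile("s3n://{bucket}/{file_path}".format(bucket=bucket, file_path=file_path))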
## Drop lines that do not match the pattern, then map the captured groups to Rows
## (responseCode and responseSize are kept as strings)
matches = lines.map(lambda l: pattern.match(l)).filter(lambda m: m is not None)
lineRows = matches.map(lambda m: m.groups()).map(lambda g: Row(
    ipAddress=g[0],
    dateTime=datetime.strptime(g[1].split(" ")[0], "%d/%b/%Y:%H:%M:%S"),
    method=g[2],
    url=g[3],
    responseCode=g[4],
    responseSize=g[5]
))
## Build a DataFrame and register it so the logs can be queried with Spark SQL
logsDf = sqlContext.createDataFrame(lineRows)
logsDf.registerTempTable("logs")
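
With the temp table registered, the parsed logs can be queried with Spark SQL. The query below is a minimal illustrative sketch, not part of the gist itself: it lists the URLs with the most 5xx responses. Since responseCode is stored as a string, a prefix match is used instead of a numeric comparison.

## Example: top URLs by number of server errors
errors = sqlContext.sql("""
    SELECT url, COUNT(*) AS error_count
    FROM logs
    WHERE responseCode LIKE '5%'
    GROUP BY url
    ORDER BY error_count DESC
    LIMIT 20
""")
errors.show()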