Created
October 13, 2017 07:37
-
-
Save earthquakesan/e35653927e4c67d88ad6d9dc32e3aa29 to your computer and use it in GitHub Desktop.
Apache Log Parsing in Spark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.spark.{SparkConf, SparkContext} | |
object ICTCSApacheLogParsing {

  /** Parsed representation of one Apache access-log entry.
    * Only the fields needed by the aggregations in [[main]] are kept. */
  final case class ApacheLog(ipAddress: String, date: String, statusCode: String)

  // Apache "combined" log format:
  //   ip ident user [date] "method path protocol" status bytes "referrer" "user-agent"
  // Capture groups (11): ip, ident, user, date, method, path, protocol, status, bytes, referrer, agent.
  private val apacheLogRegex =
    "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+)\\s?(\\S+)?\\s?(\\S+)?\" (\\d{3}|-) (\\d+|-)\\s?\"?([^\"]*)\"?\\s?\"?([^\"]*)?\"?$".r

  /** Parses an Apache access log and prints three aggregations:
    * requests per day, requests per IP, and the count of non-2xx responses. */
  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setMaster("local[4]").setAppName("My Application")
    val sc = new SparkContext(config)

    val apacheLog = sc.textFile("/path/to/access_log")

    // FIX: the original `map { l => l match { ... } }` had a non-exhaustive
    // match, so any line not matching the regex threw a MatchError and
    // aborted the job. `collect` with a partial function keeps only the
    // lines the regex accepts and silently skips malformed ones.
    val apacheLogMessages = apacheLog.collect {
      case apacheLogRegex(ipAddress, _, _, date, _, _, _, status, _, _, _) =>
        ApacheLog(ipAddress, date, status)
    }

    // Number of requests per day.
    // substring(0, 11) keeps "dd/MMM/yyyy" from "dd/MMM/yyyy:HH:mm:ss +zzzz".
    val dates = apacheLogMessages.map(m => (m.date.substring(0, 11), 1))
    val requestsPerDate = dates.reduceByKey(_ + _)
    requestsPerDate.collect().foreach(println)

    // Number of requests per IP address over the whole log duration.
    val ipAddresses = apacheLogMessages.map(m => (m.ipAddress, 1))
    val requestsPerIp = ipAddresses.reduceByKey(_ + _)
    requestsPerIp.collect().foreach(println)

    // Amount of failed requests — here defined as any response whose status
    // does not start with "2". NOTE(review): this also counts 3xx redirects
    // and "-" statuses as failures; tighten the predicate if only 4xx/5xx
    // should count.
    val failedRequestsCount = apacheLogMessages
      .map(_.statusCode)
      .filter(code => !code.startsWith("2"))
      .count()
    println(failedRequestsCount)

    // Release Spark resources; the original leaked the context.
    sc.stop()
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment