@earthquakesan
Created October 13, 2017 07:37
Apache Log Parsing in Spark
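
The gist ships only the source file below. To compile and run it with sbt, a minimal build.sbt along the following lines should work; the project name and the Scala/Spark versions are assumptions (picked to match the gist's October 2017 date) and are not part of the gist:

// Hypothetical build.sbt; versions are assumptions, not specified by the gist
name := "ictcs-apache-log-parsing"
scalaVersion := "2.11.11"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.2.0"

With that in place, sbt run starts the job on the local[4] master configured in the code below.
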
import org.apache.spark.{SparkConf, SparkContext}

object ICTCSApacheLogParsing {
  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setMaster("local[4]").setAppName("My Application")
    val sc = new SparkContext(config)
    val apacheLog = sc.textFile("/path/to/access_log")

    case class ApacheLog(ipAddress: String, date: String, statusCode: String)

    // Combined Log Format: IP address, identity, user, timestamp, request line
    // (method, path, protocol), status code, response size, referer and user agent.
    val apacheLogRegex = "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+)\\s?(\\S+)?\\s?(\\S+)?\" (\\d{3}|-) (\\d+|-)\\s?\"?([^\"]*)\"?\\s?\"?([^\"]*)?\"?$".r
    // Parse each line; lines that do not match the regex are dropped
    // instead of aborting the job with a scala.MatchError.
    val apacheLogMessages = apacheLog flatMap {
      case apacheLogRegex(ipAddress, _, _, date, method, path, protocol, status, bytes, _, browser) =>
        Some(ApacheLog(ipAddress, date, status))
      case _ => None
    }
    // Number of requests per day (the first 11 characters of the timestamp,
    // e.g. "13/Oct/2017", are the date part)
    val dates = apacheLogMessages.map(m => (m.date.substring(0, 11), 1))
    val requestsPerDate = dates.reduceByKey(_ + _)
    requestsPerDate.collect().foreach(println)

    // Number of requests per IP address over the whole log
    val ipAddresses = apacheLogMessages.map(m => (m.ipAddress, 1))
    val requestsPerIp = ipAddresses.reduceByKey(_ + _)
    requestsPerIp.collect().foreach(println)

    // Number of requests that did not return a 2xx status code
    val failedRequestsCount = apacheLogMessages.map(m => m.statusCode).filter(code => !code.startsWith("2")).count()
    println(failedRequestsCount)
  }
}
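
For a quick check of the regex outside of Spark, a sketch like the following can be pasted into a Scala REPL; the sample log line is made up for illustration and is not taken from the gist:

// Same regex as above; the sample line is a hypothetical Combined Log Format entry.
val apacheLogRegex = "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+)\\s?(\\S+)?\\s?(\\S+)?\" (\\d{3}|-) (\\d+|-)\\s?\"?([^\"]*)\"?\\s?\"?([^\"]*)?\"?$".r
val sample = "127.0.0.1 - frank [13/Oct/2017:07:37:00 +0000] \"GET /index.html HTTP/1.1\" 200 2326 \"http://example.com/\" \"Mozilla/5.0\""

sample match {
  case apacheLogRegex(ip, _, _, date, _, path, _, status, _, _, _) =>
    println(s"$ip $date $path $status")  // prints: 127.0.0.1 13/Oct/2017:07:37:00 +0000 /index.html 200
  case _ =>
    println("no match")
}
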