Skip to content

Instantly share code, notes, and snippets.

@chrisa
Created August 25, 2009 21:26
Show Gist options
  • Save chrisa/175041 to your computer and use it in GitHub Desktop.
Save chrisa/175041 to your computer and use it in GitHub Desktop.
require 'java'
require 'lib/cascading-core-1.1.0-54.jar'
require 'lib/cascading-1.1.0-54.jar'
require 'lib/hadoop-0.20.0-core.jar'
require 'lib/jgrapht-jdk1.6.jar'
require 'lib/log4j-1.2.15.jar'
require 'lib/commons-logging-1.0.4.jar'
require 'lib/commons-logging-api-1.0.4.jar'
require 'lib/commons-codec-1.3.jar'
require 'lib/commons-httpclient-3.0.1.jar'
# Exposes the Java `cascading` package under a lowercase method name so that
# package-style references such as `cascading.flow.Flow` can be written in
# this file.
def cascading
  ::Java::Cascading
end
import java.util.Properties
import cascading.flow.Flow
import cascading.flow.FlowConnector
import cascading.operation.regex.RegexParser
import cascading.pipe.Each
import cascading.pipe.Pipe
import cascading.scheme.TextLine
import cascading.tap.Hfs
import cascading.tap.Lfs
import cascading.tap.Tap
import cascading.tuple.Fields
# Command-line job that parses an Apache access log with Cascading.
#
# Each input text line is matched against APACHE_REGEX, the capture groups
# are stored under the names in FIELD_NAMES, and the resulting tuples are
# written as text lines to the output path.
class Main
  # Names assigned to the regex capture groups, in group order.
  FIELD_NAMES = %w[ip time method event status size].freeze

  # Pattern applied to every log line; it contains one capture group per
  # entry in FIELD_NAMES.
  APACHE_REGEX = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$".freeze

  # Indexes of the capture groups handed to the parser (all six of them).
  CAPTURE_GROUPS = [1, 2, 3, 4, 5, 6].freeze

  # File that receives the DOT description of the flow before it is started.
  DOT_FILE = "logparser.dot".freeze

  # Builds the parsing flow and runs it.
  #
  # @param args [Array<String>] input path followed by output path;
  #   defaults to the command-line arguments
  # @raise [ArgumentError] if either path is missing or empty
  # @return [void]
  def self.run(args = ARGV)
    input_path, output_path = args
    # Fail early with a readable message instead of handing nil to the
    # Java tap constructors.
    if [input_path, output_path].any? { |path| path.nil? || path.empty? }
      raise ArgumentError, "Usage: #{$PROGRAM_NAME} INPUT_PATH OUTPUT_PATH"
    end

    # Source is an Lfs tap and sink is an Hfs tap, both using the TextLine scheme.
    source_tap = Lfs.new(TextLine.new, input_path)
    sink_tap = Hfs.new(TextLine.new, output_path)

    parser = RegexParser.new(
      Fields.new(FIELD_NAMES.to_java(:string)),
      APACHE_REGEX,
      CAPTURE_GROUPS.to_java(:int)
    )
    # Apply the parser to the "line" field of every incoming tuple.
    import_pipe = Each.new("parser", Fields.new(["line"].to_java(:string)), parser)

    flow = FlowConnector.new(Properties.new).connect(source_tap, sink_tap, import_pipe)
    flow.writeDOT(DOT_FILE)
    flow.start
    flow.complete
  end
end
# Run the job only when this file is executed directly, so that loading it
# from another script does not start a flow as a side effect.
Main.run if __FILE__ == $PROGRAM_NAME
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment