@ottomata
Created January 17, 2013 16:20
webrequest.channels = c1
webrequest.sources = udp2log
webrequest.sinks = hdfs0 hdfs1 hdfs2 hdfs3
# Channel which buffers events in memory
webrequest.channels.c1.type = memory
webrequest.channels.c1.transactionCapacity = 1000000
webrequest.channels.c1.capacity = 1000000
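# If on-disk buffering is actually wanted, a file channel would do it at the
# cost of throughput. A sketch (the checkpoint/data paths here are made up):
# webrequest.channels.c1.type = file
# webrequest.channels.c1.checkpointDir = /var/lib/flume/checkpoint
# webrequest.channels.c1.dataDirs = /var/lib/flume/data
# webrequest.channels.c1.capacity = 1000000
# webrequest.channels.c1.transactionCapacity = 1000000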
# UDPSource Multicast (custom)
# See: https://issues.apache.org/jira/browse/FLUME-1838
webrequest.sources.udp2log.channels = c1
webrequest.sources.udp2log.type = org.apache.flume.source.udp.UDPSource
webrequest.sources.udp2log.host = 233.58.59.1
webrequest.sources.udp2log.port = 8420
webrequest.sources.udp2log.multicast = true
webrequest.sources.udp2log.interceptors = ts onehost
# received timestamp interceptor
webrequest.sources.udp2log.interceptors.ts.type = timestamp
# onehost interceptor keeps only events generated by cp1044
# (dots escaped so they match literally rather than any character).
webrequest.sources.udp2log.interceptors.onehost.type = regex_filter
webrequest.sources.udp2log.interceptors.onehost.regex = ^cp1044\.wikimedia\.org
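# The regex can be sanity-checked against a sample line before deploying,
# e.g. (assuming log lines start with the emitting hostname):
#   echo 'cp1044.wikimedia.org 61748908 ...' | grep -P '^cp1044\.wikimedia\.org'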
# HDFS sink 0
webrequest.sinks.hdfs0.channel = c1
webrequest.sinks.hdfs0.type = hdfs
webrequest.sinks.hdfs0.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S
webrequest.sinks.hdfs0.hdfs.filePrefix = webrequest-0
webrequest.sinks.hdfs0.hdfs.fileType = DataStream
webrequest.sinks.hdfs0.hdfs.round = true
webrequest.sinks.hdfs0.hdfs.roundValue = 15
webrequest.sinks.hdfs0.hdfs.roundUnit = minute
webrequest.sinks.hdfs0.hdfs.rollInterval = 60
webrequest.sinks.hdfs0.hdfs.rollCount = 0
webrequest.sinks.hdfs0.hdfs.rollSize = 0
webrequest.sinks.hdfs0.hdfs.batchSize = 1000
webrequest.sinks.hdfs0.hdfs.txnEventMax = 1000
# HDFS sink 1
webrequest.sinks.hdfs1.channel = c1
webrequest.sinks.hdfs1.type = hdfs
webrequest.sinks.hdfs1.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S
webrequest.sinks.hdfs1.hdfs.filePrefix = webrequest-1
webrequest.sinks.hdfs1.hdfs.fileType = DataStream
webrequest.sinks.hdfs1.hdfs.round = true
webrequest.sinks.hdfs1.hdfs.roundValue = 15
webrequest.sinks.hdfs1.hdfs.roundUnit = minute
webrequest.sinks.hdfs1.hdfs.rollInterval = 60
webrequest.sinks.hdfs1.hdfs.rollCount = 0
webrequest.sinks.hdfs1.hdfs.rollSize = 0
webrequest.sinks.hdfs1.hdfs.batchSize = 1000
webrequest.sinks.hdfs1.hdfs.txnEventMax = 1000
# HDFS sink 2
webrequest.sinks.hdfs2.channel = c1
webrequest.sinks.hdfs2.type = hdfs
webrequest.sinks.hdfs2.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S
webrequest.sinks.hdfs2.hdfs.filePrefix = webrequest-2
webrequest.sinks.hdfs2.hdfs.fileType = DataStream
webrequest.sinks.hdfs2.hdfs.round = true
webrequest.sinks.hdfs2.hdfs.roundValue = 15
webrequest.sinks.hdfs2.hdfs.roundUnit = minute
webrequest.sinks.hdfs2.hdfs.rollInterval = 60
webrequest.sinks.hdfs2.hdfs.rollCount = 0
webrequest.sinks.hdfs2.hdfs.rollSize = 0
webrequest.sinks.hdfs2.hdfs.batchSize = 1000
webrequest.sinks.hdfs2.hdfs.txnEventMax = 1000
# HDFS sink 3
webrequest.sinks.hdfs3.channel = c1
webrequest.sinks.hdfs3.type = hdfs
webrequest.sinks.hdfs3.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S
webrequest.sinks.hdfs3.hdfs.filePrefix = webrequest-3
webrequest.sinks.hdfs3.hdfs.fileType = DataStream
webrequest.sinks.hdfs3.hdfs.round = true
webrequest.sinks.hdfs3.hdfs.roundValue = 15
webrequest.sinks.hdfs3.hdfs.roundUnit = minute
webrequest.sinks.hdfs3.hdfs.rollInterval = 60
webrequest.sinks.hdfs3.hdfs.rollCount = 0
webrequest.sinks.hdfs3.hdfs.rollSize = 0
webrequest.sinks.hdfs3.hdfs.batchSize = 1000
webrequest.sinks.hdfs3.hdfs.txnEventMax = 1000
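# All four sinks drain c1 in parallel, each in its own thread. If explicit
# load balancing across them is preferred, a sink group sketch (untested here):
# webrequest.sinkgroups = g1
# webrequest.sinkgroups.g1.sinks = hdfs0 hdfs1 hdfs2 hdfs3
# webrequest.sinkgroups.g1.processor.type = load_balance
# webrequest.sinkgroups.g1.processor.selector = round_robin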
# download the Flume generated files from HDFS
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-3.1358437864694
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-1.1358437864888
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-2.1358437864735
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-0.1358437864735
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-3.1358437864695
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-1.1358437864889
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-2.1358437864736
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-0.1358437864736
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-3.1358437864696
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-0.1358437864737
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-2.1358437864737
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-1.1358437864890
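# the same download in one command, letting HDFS expand the glob
# (quoted so the local shell doesn't try to):
$ hadoop fs -get '/user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-*' .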
# find the first sequence number in the Flume/HDFS files:
$ start_seq=$(head -n 1 webrequest-* | awk '{print $2}' | grep -v webrequest- | grep -v -P '^$' | sort -n | head -n 1) && echo $start_seq
61748908
# find the last sequence number in the Flume/HDFS files:
$ end_seq=$(tail -n 1 webrequest-* | awk '{print $2}' | grep -v webrequest- | grep -v -P '^$' | sort -n | tail -n 1) && echo $end_seq
61827083
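# sanity check: given the values above, the boundaries should be ordered
$ [ "$start_seq" -lt "$end_seq" ] && echo ok
ok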
# Count the number of lines in the Flume/HDFS files
$ cat webrequest-* | wc -l
19451
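# a per-file breakdown would show whether one sink dropped more than the others:
$ wc -l webrequest-*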
# Count the number of lines in the raw cp1044 logs saved directly to disk
# between $start_seq and $end_seq
$ sed -n -e "/$start_seq/,/$end_seq/p" cp1044.webrequest.log | wc -l
78176
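# worth checking that both boundary sequence numbers actually occur in the raw
# log; if $start_seq never matches, the sed range above prints nothing, and if
# $end_seq never matches it runs to end of file:
$ grep -c "$start_seq" cp1044.webrequest.log
$ grep -c "$end_seq" cp1044.webrequest.log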
19451 is only about 25% of the expected 78176 log lines. Waahhhhhhhh :(
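# for the record, the exact ratio:
$ echo "scale=3; 19451 / 78176" | bc
.248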