flume.conf
webrequest.channels = c1
webrequest.sources = udp2log
webrequest.sinks = hdfs0 hdfs1 hdfs2 hdfs3
 
# Channel which buffers events in memory
webrequest.channels.c1.type = memory
webrequest.channels.c1.transactionCapacity = 1000000
webrequest.channels.c1.capacity = 1000000
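# NOTE: a memory channel loses anything still buffered if the agent dies.
# If durable buffering is wanted, a file channel is the usual alternative;
# a sketch (the checkpoint/data paths here are hypothetical):
#   webrequest.channels.c1.type = file
#   webrequest.channels.c1.checkpointDir = /var/lib/flume/file-channel/checkpoint
#   webrequest.channels.c1.dataDirs = /var/lib/flume/file-channel/data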
 
 
# UDPSource Multicast (custom)
# See: https://issues.apache.org/jira/browse/FLUME-1838
webrequest.sources.udp2log.channels = c1
webrequest.sources.udp2log.type = org.apache.flume.source.udp.UDPSource
webrequest.sources.udp2log.host = 233.58.59.1
webrequest.sources.udp2log.port = 8420
webrequest.sources.udp2log.multicast = true
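# (To smoke-test this source from a sender box, assuming socat is available
# and that this custom source treats each datagram as one event, a datagram
# can be pushed into the multicast group by hand:
#   echo "test event" | socat - UDP4-DATAGRAM:233.58.59.1:8420
# and should then show up in the sink output.)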
 
webrequest.sources.udp2log.interceptors = ts onehost
 
# received timestamp interceptor
webrequest.sources.udp2log.interceptors.ts.type = timestamp
 
# onehost interceptor filters for events generated by cp1044 only.
webrequest.sources.udp2log.interceptors.onehost.type = regex_filter
webrequest.sources.udp2log.interceptors.onehost.regex = ^cp1044\.wikimedia\.org
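# (regex_filter passes events matching the regex by default; setting
# webrequest.sources.udp2log.interceptors.onehost.excludeEvents = true
# would invert that and drop cp1044's events instead.)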
 
 
 
# HDFS sink 0
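# Four identical sinks are attached to the single channel: each Flume sink
# drains its channel from one thread, so running several in parallel (with
# distinct filePrefix values to avoid collisions) raises drain throughput.
# The %Y-%m-%d/%H.%M.%S escapes in hdfs.path are filled from the event's
# timestamp header, which the ts interceptor above provides.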
webrequest.sinks.hdfs0.channel = c1
webrequest.sinks.hdfs0.type = hdfs
webrequest.sinks.hdfs0.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S
webrequest.sinks.hdfs0.hdfs.filePrefix = webrequest-0
webrequest.sinks.hdfs0.hdfs.fileType = DataStream
webrequest.sinks.hdfs0.hdfs.round = true
webrequest.sinks.hdfs0.hdfs.roundValue = 15
webrequest.sinks.hdfs0.hdfs.roundUnit = minute
webrequest.sinks.hdfs0.hdfs.rollInterval = 60
webrequest.sinks.hdfs0.hdfs.rollCount = 0
webrequest.sinks.hdfs0.hdfs.rollSize = 0
webrequest.sinks.hdfs0.hdfs.batchSize = 1000
webrequest.sinks.hdfs0.hdfs.txnEventMax = 1000
 
# HDFS sink 1
webrequest.sinks.hdfs1.channel = c1
webrequest.sinks.hdfs1.type = hdfs
webrequest.sinks.hdfs1.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S
webrequest.sinks.hdfs1.hdfs.filePrefix = webrequest-1
webrequest.sinks.hdfs1.hdfs.fileType = DataStream
webrequest.sinks.hdfs1.hdfs.round = true
webrequest.sinks.hdfs1.hdfs.roundValue = 15
webrequest.sinks.hdfs1.hdfs.roundUnit = minute
webrequest.sinks.hdfs1.hdfs.rollInterval = 60
webrequest.sinks.hdfs1.hdfs.rollCount = 0
webrequest.sinks.hdfs1.hdfs.rollSize = 0
webrequest.sinks.hdfs1.hdfs.batchSize = 1000
webrequest.sinks.hdfs1.hdfs.txnEventMax = 1000
 
# HDFS sink 2
webrequest.sinks.hdfs2.channel = c1
webrequest.sinks.hdfs2.type = hdfs
webrequest.sinks.hdfs2.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S
webrequest.sinks.hdfs2.hdfs.filePrefix = webrequest-2
webrequest.sinks.hdfs2.hdfs.fileType = DataStream
webrequest.sinks.hdfs2.hdfs.round = true
webrequest.sinks.hdfs2.hdfs.roundValue = 15
webrequest.sinks.hdfs2.hdfs.roundUnit = minute
webrequest.sinks.hdfs2.hdfs.rollInterval = 60
webrequest.sinks.hdfs2.hdfs.rollCount = 0
webrequest.sinks.hdfs2.hdfs.rollSize = 0
webrequest.sinks.hdfs2.hdfs.batchSize = 1000
webrequest.sinks.hdfs2.hdfs.txnEventMax = 1000
 
# HDFS sink 3
webrequest.sinks.hdfs3.channel = c1
webrequest.sinks.hdfs3.type = hdfs
webrequest.sinks.hdfs3.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S
webrequest.sinks.hdfs3.hdfs.filePrefix = webrequest-3
webrequest.sinks.hdfs3.hdfs.fileType = DataStream
webrequest.sinks.hdfs3.hdfs.round = true
webrequest.sinks.hdfs3.hdfs.roundValue = 15
webrequest.sinks.hdfs3.hdfs.roundUnit = minute
webrequest.sinks.hdfs3.hdfs.rollInterval = 60
webrequest.sinks.hdfs3.hdfs.rollCount = 0
webrequest.sinks.hdfs3.hdfs.rollSize = 0
webrequest.sinks.hdfs3.hdfs.batchSize = 1000
webrequest.sinks.hdfs3.hdfs.txnEventMax = 1000
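 
# To start an agent with this config (assuming a stock flume-ng install; the
# agent name must match the "webrequest" prefix used throughout this file):
#   flume-ng agent --name webrequest --conf /etc/flume-ng/conf --conf-file flume.conf
 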
investigation
# download the Flume generated files from HDFS
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-3.1358437864694
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-1.1358437864888
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-2.1358437864735
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-0.1358437864735
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-3.1358437864695
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-1.1358437864889
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-2.1358437864736
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-0.1358437864736
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-3.1358437864696
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-0.1358437864737
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-2.1358437864737
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-1.1358437864890
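 
# (hadoop fs -get accepts globs, so the same files could be fetched in one go:
#   hadoop fs -get '/user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-*' . )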
 
 
 
# find the first sequence number in the Flume/HDFS files:
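# (head -n 1 over multiple files emits "==> filename <==" headers plus blank
# separators; the grep -v calls strip those so only sequence numbers remain)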
$ start_seq=$(head -n 1 webrequest-* | awk '{print $2}' | grep -v webrequest- | grep -v -P '^$' | sort -n | head -n 1) && echo $start_seq
61748908
 
# find the last sequence number in the Flume/HDFS files:
$ end_seq=$(tail -n 1 webrequest-* | awk '{print $2}' | grep -v webrequest- | grep -v -P '^$' | sort -n | tail -n 1) && echo $end_seq
61827083
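 
# sanity check: the sequence-number span implies the expected line count
# (61827083 - 61748908 + 1), and it agrees with the raw-log count below
$ echo $((end_seq - start_seq + 1))
78176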
 
# Count the number of lines in the Flume/HDFS files
$ cat webrequest-* | wc -l
19451
 
# Count the number of lines in the raw cp1044 logs saved directly to disk
# between $start_seq and $end_seq
$ sed -n -e "/$start_seq/,/$end_seq/p" cp1044.webrequest.log | wc -l
78176
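 
# (sed matches $start_seq/$end_seq as substrings anywhere on a line; assuming
# the sequence number is field 2, as in the awk above, a stricter count is:
#   awk -v s=$start_seq -v e=$end_seq '$2+0 >= s+0 && $2+0 <= e+0' cp1044.webrequest.log | wc -l )
 
# for the record, the delivery ratio:
$ echo "scale=1; 100 * 19451 / 78176" | bc
24.8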
 
 
 
19451 is only about 25% of the expected 78176 log lines. Waahhhhhhhh :(
