Created
January 17, 2013 16:20
-
-
Save ottomata/4557178 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
webrequest.channels = c1 | |
webrequest.sources = udp2log | |
webrequest.sinks = hdfs0 hdfs1 hdfs2 hdfs3 | |
# Channel which buffers events in memory (type = memory; events are not persisted to disk) | |
webrequest.channels.c1.type = memory | |
webrequest.channels.c1.transactionCapacity = 1000000 | |
webrequest.channels.c1.capacity = 1000000 | |
# UDPSource Multicast (custom) | |
# See: https://issues.apache.org/jira/browse/FLUME-1838 | |
webrequest.sources.udp2log.channels = c1 | |
webrequest.sources.udp2log.type = org.apache.flume.source.udp.UDPSource | |
webrequest.sources.udp2log.host = 233.58.59.1 | |
webrequest.sources.udp2log.port = 8420 | |
webrequest.sources.udp2log.multicast = true | |
webrequest.sources.udp2log.interceptors = ts onehost | |
# received timestamp interceptor | |
webrequest.sources.udp2log.interceptors.ts.type = timestamp | |
# onehost interceptor filters for events generated by cp1044 only. | |
webrequest.sources.udp2log.interceptors.onehost.type = regex_filter | |
webrequest.sources.udp2log.interceptors.onehost.regex = ^cp1044.wikimedia.org | |
# HDFS sink 0 | |
webrequest.sinks.hdfs0.channel = c1 | |
webrequest.sinks.hdfs0.type = hdfs | |
webrequest.sinks.hdfs0.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S | |
webrequest.sinks.hdfs0.hdfs.filePrefix = webrequest-0 | |
webrequest.sinks.hdfs0.hdfs.fileType = DataStream | |
webrequest.sinks.hdfs0.hdfs.round = true | |
webrequest.sinks.hdfs0.hdfs.roundValue = 15 | |
webrequest.sinks.hdfs0.hdfs.roundUnit = minute | |
webrequest.sinks.hdfs0.hdfs.rollInterval = 60 | |
webrequest.sinks.hdfs0.hdfs.rollCount = 0 | |
webrequest.sinks.hdfs0.hdfs.rollSize = 0 | |
webrequest.sinks.hdfs0.hdfs.batchSize = 1000 | |
webrequest.sinks.hdfs0.hdfs.txnEventMax = 1000 | |
# HDFS sink 1 | |
webrequest.sinks.hdfs1.channel = c1 | |
webrequest.sinks.hdfs1.type = hdfs | |
webrequest.sinks.hdfs1.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S | |
webrequest.sinks.hdfs1.hdfs.filePrefix = webrequest-1 | |
webrequest.sinks.hdfs1.hdfs.fileType = DataStream | |
webrequest.sinks.hdfs1.hdfs.round = true | |
webrequest.sinks.hdfs1.hdfs.roundValue = 15 | |
webrequest.sinks.hdfs1.hdfs.roundUnit = minute | |
webrequest.sinks.hdfs1.hdfs.rollInterval = 60 | |
webrequest.sinks.hdfs1.hdfs.rollCount = 0 | |
webrequest.sinks.hdfs1.hdfs.rollSize = 0 | |
webrequest.sinks.hdfs1.hdfs.batchSize = 1000 | |
webrequest.sinks.hdfs1.hdfs.txnEventMax = 1000 | |
# HDFS sink 2 | |
webrequest.sinks.hdfs2.channel = c1 | |
webrequest.sinks.hdfs2.type = hdfs | |
webrequest.sinks.hdfs2.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S | |
webrequest.sinks.hdfs2.hdfs.filePrefix = webrequest-2 | |
webrequest.sinks.hdfs2.hdfs.fileType = DataStream | |
webrequest.sinks.hdfs2.hdfs.round = true | |
webrequest.sinks.hdfs2.hdfs.roundValue = 15 | |
webrequest.sinks.hdfs2.hdfs.roundUnit = minute | |
webrequest.sinks.hdfs2.hdfs.rollInterval = 60 | |
webrequest.sinks.hdfs2.hdfs.rollCount = 0 | |
webrequest.sinks.hdfs2.hdfs.rollSize = 0 | |
webrequest.sinks.hdfs2.hdfs.batchSize = 1000 | |
webrequest.sinks.hdfs2.hdfs.txnEventMax = 1000 | |
# HDFS sink 3 | |
webrequest.sinks.hdfs3.channel = c1 | |
webrequest.sinks.hdfs3.type = hdfs | |
webrequest.sinks.hdfs3.hdfs.path = /user/otto/tmp/flume/%Y-%m-%d/%H.%M.%S | |
webrequest.sinks.hdfs3.hdfs.filePrefix = webrequest-3 | |
webrequest.sinks.hdfs3.hdfs.fileType = DataStream | |
webrequest.sinks.hdfs3.hdfs.round = true | |
webrequest.sinks.hdfs3.hdfs.roundValue = 15 | |
webrequest.sinks.hdfs3.hdfs.roundUnit = minute | |
webrequest.sinks.hdfs3.hdfs.rollInterval = 60 | |
webrequest.sinks.hdfs3.hdfs.rollCount = 0 | |
webrequest.sinks.hdfs3.hdfs.rollSize = 0 | |
webrequest.sinks.hdfs3.hdfs.batchSize = 1000 | |
webrequest.sinks.hdfs3.hdfs.txnEventMax = 1000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# download the Flume generated files from HDFS | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-3.1358437864694 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-1.1358437864888 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-2.1358437864735 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-0.1358437864735 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-3.1358437864695 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-1.1358437864889 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-2.1358437864736 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-0.1358437864736 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-3.1358437864696 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-0.1358437864737 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-2.1358437864737 | |
$ hadoop fs -get /user/otto/tmp/flume/2013-01-17/15.45.00/webrequest-1.1358437864890 | |
# find the first sequence number in the Flume/HDFS files: | |
$ start_seq=$(head -n 1 webrequest-* | awk '{print $2}' | grep -v webrequest- | grep -v -P '^$' | sort | head -n 1) && echo $start_seq | |
61748908 | |
# find the last sequence number in the Flume/HDFS files: | |
$ end_seq=$(tail -n 1 webrequest-* | awk '{print $2}' | grep -v webreq | grep -v -P '^$' | sort | tail -n 1) && echo $end_seq; | |
61827083 | |
# Count the number of lines in the Flume/HDFS files | |
$ cat webrequest-* | wc -l | |
19451 | |
# Count the number of lines in the raw cp1044 logs saved directly to disk | |
# between $start_seq and $end_seq | |
$ sed -n -e "/$start_seq/,/$end_seq/p" cp1044.webrequest.log | wc -l | |
78176 | |
19451 is only about 25% of the expected 78176 log lines. Waahhhhhhhh :( | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment