Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ottomata/6816039 to your computer and use it in GitHub Desktop.
Save ottomata/6816039 to your computer and use it in GitHub Desktop.
webrequest_esams0
38746878 records
MapReduce Total cumulative CPU time: 11 seconds 520 msec
Ended Job = job_1379030026553_0153
MapReduce Jobs Launched:
Job 0: Map: 97 Reduce: 26 Cumulative CPU: 3839.52 sec HDFS Read: 25972004039 HDFS Write: 1336075346 SUCCESS
Job 1: Map: 106 Reduce: 28 Cumulative CPU: 4060.18 sec HDFS Read: 27308086098 HDFS Write: 3192 SUCCESS
Job 2: Map: 6 Reduce: 1 Cumulative CPU: 11.52 sec HDFS Read: 9886 HDFS Write: 2 SUCCESS
Total MapReduce CPU Time Spent: 0 days 2 hours 11 minutes 51 seconds 220 msec
OK
_c0
6
Job 0: Map: 97 Reduce: 1 Cumulative CPU: 3379.42 sec HDFS Read: 25972004039 HDFS Write: 117 SUCCESS
Job 1: Map: 97 Reduce: 26 Cumulative CPU: 4570.52 sec HDFS Read: 25972004039 HDFS Write: 2583 SUCCESS
Job 2: Map: 113 Reduce: 1 Cumulative CPU: 4177.4 sec HDFS Read: 30153840760 HDFS Write: 118 SUCCESS
Job 3: Map: 97 Reduce: 26 Cumulative CPU: 4672.28 sec HDFS Read: 25972004039 HDFS Write: 2584 SUCCESS
Job 4: Map: 8 Reduce: 1 Cumulative CPU: 16.45 sec HDFS Read: 9476 HDFS Write: 140 SUCCESS
Job 5: Map: 6 Reduce: 1 Cumulative CPU: 13.92 sec HDFS Read: 9188 HDFS Write: 163 SUCCESS
Job 6: Map: 1 Reduce: 1 Cumulative CPU: 3.54 sec HDFS Read: 493 HDFS Write: 140 SUCCESS
Job 7: Map: 1 Reduce: 1 Cumulative CPU: 4.19 sec HDFS Read: 516 HDFS Write: 163 SUCCESS
Job 8: Map: 2 Reduce: 1 Cumulative CPU: 6.3 sec HDFS Read: 1009 HDFS Write: 40 SUCCESS
Total MapReduce CPU Time Spent: 0 days 4 hours 40 minutes 44 seconds 20 msec
OK
sequence sequence missing
14674712 14674714 1
14682058 14682060 1
Time taken: 548.884 seconds
MapReduce Total cumulative CPU time: 4 seconds 710 msec
Ended Job = job_1379030026553_0465
MapReduce Jobs Launched:
Job 0: Map: 1592 Reduce: 431 Cumulative CPU: 64713.9 sec HDFS Read: 430472626221 HDFS Write: 41458 SUCCESS
Job 1: Map: 1592 Reduce: 431 Cumulative CPU: 64889.64 sec HDFS Read: 430472626221 HDFS Write: 41466 SUCCESS
Job 2: Map: 1592 Reduce: 431 Cumulative CPU: 85013.92 sec HDFS Read: 430472626221 HDFS Write: 41800 SUCCESS
Job 3: Map: 1592 Reduce: 431 Cumulative CPU: 85648.47 sec HDFS Read: 430472626221 HDFS Write: 42092 SUCCESS
Job 4: Map: 16 Reduce: 1 Cumulative CPU: 32.26 sec HDFS Read: 264882 HDFS Write: 396 SUCCESS
Job 5: Map: 16 Reduce: 1 Cumulative CPU: 34.47 sec HDFS Read: 265166 HDFS Write: 396 SUCCESS
Job 6: Map: 1 Reduce: 1 Cumulative CPU: 3.46 sec HDFS Read: 749 HDFS Write: 396 SUCCESS
Job 7: Map: 1 Reduce: 1 Cumulative CPU: 4.15 sec HDFS Read: 749 HDFS Write: 396 SUCCESS
Job 8: Map: 2 Reduce: 1 Cumulative CPU: 4.71 sec HDFS Read: 1498 HDFS Write: 284 SUCCESS
Total MapReduce CPU Time Spent: 3 days 11 hours 25 minutes 44 seconds 980 msec
OK
hostname missing_from missing_to missing
cp3003.esams.wikimedia.org 19537005 19537007 1
cp3003.esams.wikimedia.org 19537314 19537316 1
cp3003.esams.wikimedia.org 19539168 19539170 1
cp3003.esams.wikimedia.org 19573875 19573877 1
cp3003.esams.wikimedia.org 28677724 28677726 1
cp3003.esams.wikimedia.org 111682679 111682681 1
Time taken: 4803.44 seconds
/* get the missing packets. Sample run:
sequence sequence missing
======== ======== =======
91965123 91965125 1
103290989 103290991 1
*/
-- Register the Hive contrib jar and expose UDFRowSequence as rowSequence(),
-- used below to assign a per-reducer running row number to each row.
-- Fix: 'add jar' was missing its terminating ';' — in a Hive script the
-- statement would otherwise fuse with the following CREATE and fail.
add jar /usr/lib/hive/lib/hive-contrib-0.10.0-cdh4.3.1.jar;
create temporary function rowSequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';
-- Report each run of missing sequence numbers as (last seq before gap,
-- first seq after gap, gap size). The two subqueries find the gap
-- boundaries and are paired up positionally via rowSequence().
-- NOTE(review): correctness depends on rowSequence() assigning ids in
-- sequence order — the ORDER BY + huge LIMIT forces the subquery through a
-- single reducer so both id streams line up; verify this still holds on the
-- target Hive version before reusing.
select starting.sequence,
ending.sequence,
-- gap width: number of sequence values absent between the two boundaries
ending.sequence - starting.sequence - 1 as missing
-- "starting" = rows whose successor (sequence + 1) is absent in the window,
-- i.e. the last sequence number before each gap. The window's max sequence
-- is excluded via the MaxSeq anti-join since it has no successor by definition.
from (select M.sequence,
rowSequence() as id
from webrequest_esams0 M
left outer join
webrequest_esams0 G on G.sequence = M.sequence + 1
and G.YEAR = 2013 and G.HOUR BETWEEN 2 and 12
left outer join
(select max(sequence) as max_seq
from webrequest_esams0
where year = 2013 and hour BETWEEN 2 and 12
) MaxSeq on MaxSeq.max_seq = M.sequence
where G.sequence is null
and MaxSeq.max_seq is null
and M.YEAR = 2013 and M.HOUR BETWEEN 2 and 12
order by M.sequence
-- the large LIMIT is a Hive trick: it forces the ORDER BY to execute
-- (single reducer) so rowSequence() ids are assigned in sorted order
limit 1000000000
) starting
inner join
-- "ending" = rows whose predecessor (sequence - 1) is absent in the window,
-- i.e. the first sequence number after each gap. The window's min sequence
-- is excluded via the MinSeq anti-join since it has no predecessor.
(select M.sequence,
rowSequence() as id
from webrequest_esams0 M
left outer join
webrequest_esams0 L on L.sequence = M.sequence - 1
and L.YEAR = 2013 and L.HOUR BETWEEN 2 and 12
left outer join
(select min(sequence) as min_seq
from webrequest_esams0
where year = 2013 and hour BETWEEN 2 and 12
) MinSeq on MinSeq.min_seq = M.sequence
where L.sequence is null
and MinSeq.min_seq is null
and M.YEAR = 2013 and M.HOUR BETWEEN 2 and 12
order by M.sequence
limit 1000000000
) ending on ending.id = starting.id
-- pair i-th gap start with i-th gap end by synthetic row number
;
/* Quick sanity check: are there any missing packets in the window at all?
Every gap in the sequence numbers contributes exactly two boundary rows
(the row just before the gap and the row just after it), and the window's
global min and max rows contribute one each, so a result > 2 means gaps
exist and the number of gap runs is (result - 2) / 2.
Sample run returned 6, i.e. (6 - 2) / 2 = 2 runs of missing sequence numbers. */
select count(cur.sequence)
from webrequest_esams0 cur
left outer join
webrequest_esams0 nxt on nxt.sequence = cur.sequence + 1
and nxt.YEAR = 2013 and nxt.HOUR BETWEEN 2 and 12
left outer join
webrequest_esams0 prv on prv.sequence = cur.sequence - 1
and prv.YEAR = 2013 and prv.HOUR BETWEEN 2 and 12
where (nxt.sequence is null or prv.sequence is null)
and cur.YEAR = 2013 and cur.HOUR BETWEEN 2 and 12
;
-- Per-host count of missing-sequence runs across a window spanning a day
-- boundary (day 7 16:00-23 plus day 8 00:00-16). Hive evaluates each join's
-- ON clause independently, so expressing "successor exists in EITHER
-- partition window" requires two left joins: G1 probes the day-7 window,
-- G2 the day-8 window ("double join for or").
-- Fix: the original used '#' for this note, which is not a valid Hive
-- comment marker ('--' is); the script would fail to parse.
-- NOTE(review): a row in one window whose successor landed in the OTHER
-- window is flagged as a gap boundary here — presumably absorbed by the
-- "- 1" correction below, but verify against the expected per-host counts.
set tablename=varnish0;
select M.hostname,
-- each host's final sequence number never has a successor and always
-- produces one boundary row; subtract it so only real gap runs remain
count(*) - 1 as missing_sequence_runs
from ${hiveconf:tablename} M
left outer join
${hiveconf:tablename} G1 on G1.hostname = M.hostname
and G1.sequence = M.sequence + 1
and G1.DAY = 7
and G1.HOUR BETWEEN 16 and 23
left outer join
${hiveconf:tablename} G2 on G2.hostname = M.hostname
and G2.sequence = M.sequence + 1
and G2.DAY = 8
and G2.HOUR BETWEEN 0 and 16
where ( M.DAY = 7
and M.HOUR BETWEEN 16 and 23
and G1.sequence is null
) or
( M.DAY = 8
and M.HOUR BETWEEN 0 and 16
and G2.sequence is null
)
group by M.hostname
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment