Skip to content

Instantly share code, notes, and snippets.

@ottomata
Last active May 27, 2016 15:35
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ottomata/7048012 to your computer and use it in GitHub Desktop.
Save ottomata/7048012 to your computer and use it in GitHub Desktop.
set start_day=16;
set end_day=17;
set table_name=varnish2;
select M.hostname,
M.lower_seq,
M.upper_seq,
M.total_seqs,
(M.expected_seqs - M.total_seqs) as missing_seqs,
(M.expected_seqs - M.total_seqs) / M.expected_seqs as average_loss
from (select T.hostname,
min(T.sequence) as lower_seq,
max(T.sequence) as upper_seq,
max(T.sequence) - min(T.sequence) + 1 as expected_seqs,
count(*) as total_seqs
from ${hiveconf:table_name} T
where T.YEAR = 2013
and T.DAY between ${hiveconf:start_day} and ${hiveconf:end_day}
and T.sequence <> 0
group by T.hostname
) M
;
set start_day=16;
set end_day=17;
set table_name=varnish2;
/* Find missing sequence runs counts per hostname */
select M.hostname,
count(*) - 1 as missing_sequence_runs
from ${hiveconf:tablename} M
left outer join
${hiveconf:tablename} G on G.hostname = M.hostname
and G.sequence = M.sequence + 1
and G.YEAR = 2013 and G.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
and G.sequence <> 0
where G.sequence is null
and M.YEAR = 2013 and M.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day} and M.sequence <> 0
group by M.hostname
;
set start_day=16;
set end_day=17;
set table_name=varnish2;
add jar /home/otto/hive-serdes-1.0-SNAPSHOT.jar;
add jar /usr/lib/hive/lib/hive-contrib-0.10.0-cdh4.3.1.jar;
create temporary function rowSequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';
/* Get the missing sequence runs. Sample run:
sequence sequence missing
======== ======== =======
91965123 91965125 1
103290989 103290991 1
*/
select starting.hostname,
starting.sequence as missing_from,
ending.sequence as missing_to,
ending.sequence - starting.sequence - 1 as missing
from (select M.sequence,
rowSequence() as id,
M.hostname
from ${hiveconf:table_name} M
left outer join
${hiveconf:table_name} G on G.sequence = M.sequence + 1
and G.hostname = M.hostname
and G.YEAR = 2013 and G.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
and G.sequence <> 0
left outer join
(select max(sequence) as max_seq,
hostname
from ${hiveconf:table_name}
where year = 2013 and day BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
and sequence <> 0
group by hostname
) MaxSeq on MaxSeq.max_seq = M.sequence
and MaxSeq.hostname = M.hostname
where G.sequence is null
and MaxSeq.max_seq is null
and M.YEAR = 2013 and M.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
and M.sequence <> 0
order by M.hostname, M.sequence
limit 1000000000
) starting
inner join
(select M.sequence,
rowSequence() as id,
M.hostname
from ${hiveconf:table_name} M
left outer join
${hiveconf:table_name} L on L.sequence = M.sequence - 1
and L.hostname = M.hostname
and L.YEAR = 2013 and L.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
and L.sequence <> 0
left outer join
(select min(sequence) as min_seq,
hostname
from ${hiveconf:table_name}
where year > 2013 and day BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
and sequence <> 0
group by hostname
) MinSeq on MinSeq.min_seq = M.sequence
and MinSeq.hostname = M.hostname
where L.sequence is null
and MinSeq.min_seq is null
and M.YEAR = 2013 and M.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
and M.sequence <> 0
order by M.hostname, M.sequence
limit 1000000000
) ending on ending.id = starting.id
and ending.hostname = starting.hostname
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment