ottomata/average_loss.sql

## average_loss.sql
set start_day=16;
set end_day=17;
set table_name=varnish2;

 select M.hostname,
        M.lower_seq,
        M.upper_seq,
        M.total_seqs,
        (M.expected_seqs - M.total_seqs) as missing_seqs,
        (M.expected_seqs - M.total_seqs) / M.expected_seqs as average_loss
   from (select T.hostname,
                min(T.sequence) as lower_seq,
                max(T.sequence) as upper_seq,
                max(T.sequence) - min(T.sequence) + 1 as expected_seqs,
                count(*)        as total_seqs
           from ${hiveconf:table_name} T
          where T.YEAR = 2013
            and T.DAY  between ${hiveconf:start_day} and ${hiveconf:end_day}
            and T.sequence <> 0
          group by T.hostname
        ) M
;

## count_missing_sequence_runs.sql
    set start_day=16;
    set end_day=17;
    set table_name=varnish2;

/* Find missing sequence runs counts per hostname */

 select M.hostname,
        count(*) - 1 as missing_sequence_runs
   from ${hiveconf:tablename} M
            left outer join
        ${hiveconf:tablename} G on G.hostname = M.hostname
                               and G.sequence = M.sequence + 1
                               and G.YEAR = 2013 and G.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
                               and G.sequence <> 0
  where G.sequence is null
    and M.YEAR = 2013 and M.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day} and M.sequence <> 0
  group by M.hostname
;


## missing_sequence_runs.sql

    set start_day=16;
    set end_day=17;
    set table_name=varnish2;

    add jar /home/otto/hive-serdes-1.0-SNAPSHOT.jar;
    add jar /usr/lib/hive/lib/hive-contrib-0.10.0-cdh4.3.1.jar;
 create temporary function rowSequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';


/* Get the missing sequence runs. Sample run:

sequence    sequence    missing
========    ========    =======
91965123    91965125    1
103290989   103290991   1

 */
select starting.hostname,
       starting.sequence                       as missing_from,
       ending.sequence                         as missing_to,
       ending.sequence - starting.sequence - 1 as missing

  from (select M.sequence,
               rowSequence() as id,
               M.hostname
          from ${hiveconf:table_name} M
                   left outer join
               ${hiveconf:table_name} G on G.sequence = M.sequence + 1
                                      and G.hostname = M.hostname
                                      and G.YEAR = 2013 and G.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
                                      and G.sequence <> 0
                   left outer join
               (select max(sequence) as max_seq,
                       hostname
                  from ${hiveconf:table_name}
                 where year = 2013 and day BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
                                      and sequence <> 0
                 group by hostname
               ) MaxSeq               on MaxSeq.max_seq = M.sequence
                                     and MaxSeq.hostname = M.hostname
         where G.sequence is null
           and MaxSeq.max_seq is null
           and M.YEAR = 2013 and M.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
           and M.sequence <> 0
         order by M.hostname, M.sequence
         limit 1000000000
       ) starting
           inner join
       (select M.sequence,
               rowSequence() as id,
               M.hostname
          from ${hiveconf:table_name} M
                   left outer join
               ${hiveconf:table_name} L on L.sequence = M.sequence - 1
                                      and L.hostname = M.hostname
                                      and L.YEAR = 2013 and L.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
                                      and L.sequence <> 0
                   left outer join
               (select min(sequence) as min_seq,
                       hostname
                  from ${hiveconf:table_name}
                 where year > 2013 and day BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
                   and sequence <> 0
                 group by hostname
               ) MinSeq                on MinSeq.min_seq = M.sequence
                                      and MinSeq.hostname = M.hostname
         where L.sequence is null
           and MinSeq.min_seq is null
           and M.YEAR = 2013 and M.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day}
           and M.sequence <> 0
         order by M.hostname, M.sequence
         limit 1000000000
       ) ending        on ending.id = starting.id
                      and ending.hostname = starting.hostname
;
	set start_day=16;
	set end_day=17;
	set table_name=varnish2;

	select M.hostname,
	M.lower_seq,
	M.upper_seq,
	M.total_seqs,
	(M.expected_seqs - M.total_seqs) as missing_seqs,
	(M.expected_seqs - M.total_seqs) / M.expected_seqs as average_loss
	from (select T.hostname,
	min(T.sequence) as lower_seq,
	max(T.sequence) as upper_seq,
	max(T.sequence) - min(T.sequence) + 1 as expected_seqs,
	count(*) as total_seqs
	from ${hiveconf:table_name} T
	where T.YEAR = 2013
	and T.DAY between ${hiveconf:start_day} and ${hiveconf:end_day}
	and T.sequence <> 0
	group by T.hostname
	) M
	;