jmaher/gist:63657c23d0f2149cfed577005490f583

## gistfile1.txt
DECLARE start_date DATE DEFAULT "2020-04-20";
DECLARE end_date DATE DEFAULT "2020-04-24";
-- granularity is date_part: ideally 'day' or 'month', could be 'week'
DECLARE granularity STRING DEFAULT "day";
DECLARE types ARRAY <STRING>;
set types = ['gecko-t/t-linux-xlarge', 'gecko-t/t-linux-large', 'gecko-t/t-win10-64', 'gecko-t/t-win10-64-gpu-s', 'gecko-t/t-win7-32', 'gecko-t/t-win7-32-gpu'];
set types = ['gecko-t/t-win10-64-gpu-s'];

SELECT
  orphaned.date,
  orphaned.workerPoolId,
  sum(orphaned.hours) as orphaned_hours,
  sum(failed_reboot.bill_hours) as bill_hours,
  sum(failed_reboot.worker_hours) as worker_hours,
  sum(orphaned.hours)+sum(failed_reboot.bill_hours)-(sum(failed_reboot.worker_hours)) as hours_orphaned
FROM (
  SELECT
    failed_data.date as date,
    failed_data.workerPoolId as workerPoolId,
    sum(failed_data.bill_hours) as bill_hours,
    sum(worker_seconds)/3600 as worker_hours from (
    SELECT
      CASE
        WHEN granularity = 'day' THEN DATE_TRUNC(DATE(wm.timestamp), DAY)
        WHEN granularity = 'month' THEN DATE_TRUNC(DATE(wm.timestamp), MONTH)
      END as date,
      wm.workerPoolId,
      aws.hours as bill_hours,
      TIMESTAMP_DIFF(max(wm.timestamp), min(wm.timestamp), SECOND) as worker_seconds
    FROM
      taskclusteretl.worker_metrics as wm, taskclusteretl.tmp_aws_instance_hours as aws
    WHERE
      wm.workerPoolId in UNNEST(types)
      AND date(wm.timestamp)>=start_date
      AND date(wm.timestamp)<end_date
      AND aws.date>=start_date
      AND aws.date<end_date
      AND aws.workerPoolId in UNNEST(types)
      AND wm.eventType not like 'taskQueue%'
      AND wm.workerId in (
        SELECT instanceId from taskclusteretl.tmp_aws_instance_hours where workerPoolId in UNNEST(types) and date>=start_date and date<end_date and hours>=20)
      AND wm.workerId=aws.instanceId
    GROUP BY
      date,
      wm.workerPoolId,
      aws.hours
    ORDER BY
      date asc
     ) as failed_data
  GROUP BY
    failed_data.date,
    failed_data.workerPoolId
  ORDER BY
    date asc
) as failed_reboot, (
  SELECT
    date,
    workerPoolId,
    sum(hours) as hours
  FROM
    taskclusteretl.tmp_aws_instance_hours
  WHERE
    workerPoolId in UNNEST(types)
    AND date>=start_date
    AND date<end_date
    AND instanceId not in (
      SELECT workerId from taskclusteretl.worker_metrics where workerPoolId in UNNEST(types) and date(timestamp)>=start_date and date(timestamp)<end_date)
  GROUP BY
    date, workerPoolId
  ORDER BY
    date asc
) as orphaned
WHERE
  failed_reboot.date=orphaned.date
  AND failed_reboot.workerPoolId=orphaned.workerPoolId
GROUP BY
  orphaned.date, orphaned.workerPoolId
ORDER BY
  date asc
	DECLARE start_date DATE DEFAULT "2020-04-20";
	DECLARE end_date DATE DEFAULT "2020-04-24";
	-- granularity is date_part: ideally 'day' or 'month', could be 'week'
	DECLARE granularity STRING DEFAULT "day";
	DECLARE types ARRAY <STRING>;
	set types = ['gecko-t/t-linux-xlarge', 'gecko-t/t-linux-large', 'gecko-t/t-win10-64', 'gecko-t/t-win10-64-gpu-s', 'gecko-t/t-win7-32', 'gecko-t/t-win7-32-gpu'];
	set types = ['gecko-t/t-win10-64-gpu-s'];

	SELECT
	orphaned.date,
	orphaned.workerPoolId,
	sum(orphaned.hours) as orphaned_hours,
	sum(failed_reboot.bill_hours) as bill_hours,
	sum(failed_reboot.worker_hours) as worker_hours,
	sum(orphaned.hours)+sum(failed_reboot.bill_hours)-(sum(failed_reboot.worker_hours)) as hours_orphaned
	FROM (
	SELECT
	failed_data.date as date,
	failed_data.workerPoolId as workerPoolId,
	sum(failed_data.bill_hours) as bill_hours,
	sum(worker_seconds)/3600 as worker_hours from (
	SELECT
	CASE
	WHEN granularity = 'day' THEN DATE_TRUNC(DATE(wm.timestamp), DAY)
	WHEN granularity = 'month' THEN DATE_TRUNC(DATE(wm.timestamp), MONTH)
	END as date,
	wm.workerPoolId,
	aws.hours as bill_hours,
	TIMESTAMP_DIFF(max(wm.timestamp), min(wm.timestamp), SECOND) as worker_seconds
	FROM
	taskclusteretl.worker_metrics as wm, taskclusteretl.tmp_aws_instance_hours as aws
	WHERE
	wm.workerPoolId in UNNEST(types)
	AND date(wm.timestamp)>=start_date
	AND date(wm.timestamp)<end_date
	AND aws.date>=start_date
	AND aws.date<end_date
	AND aws.workerPoolId in UNNEST(types)
	AND wm.eventType not like 'taskQueue%'
	AND wm.workerId in (
	SELECT instanceId from taskclusteretl.tmp_aws_instance_hours where workerPoolId in UNNEST(types) and date>=start_date and date<end_date and hours>=20)
	AND wm.workerId=aws.instanceId
	GROUP BY
	date,
	wm.workerPoolId,
	aws.hours
	ORDER BY
	date asc
	) as failed_data
	GROUP BY
	failed_data.date,
	failed_data.workerPoolId
	ORDER BY
	date asc
	) as failed_reboot, (
	SELECT
	date,
	workerPoolId,
	sum(hours) as hours
	FROM
	taskclusteretl.tmp_aws_instance_hours
	WHERE
	workerPoolId in UNNEST(types)
	AND date>=start_date
	AND date<end_date
	AND instanceId not in (
	SELECT workerId from taskclusteretl.worker_metrics where workerPoolId in UNNEST(types) and date(timestamp)>=start_date and date(timestamp)<end_date)
	GROUP BY
	date, workerPoolId
	ORDER BY
	date asc
	) as orphaned
	WHERE
	failed_reboot.date=orphaned.date
	AND failed_reboot.workerPoolId=orphaned.workerPoolId
	GROUP BY
	orphaned.date, orphaned.workerPoolId
	ORDER BY
	date asc