Skip to content

Instantly share code, notes, and snippets.

@sunahsuh
Created November 12, 2019 20:57
Show Gist options
  • Save sunahsuh/4236fd4072c0edc73dbf765d52576a51 to your computer and use it in GitHub Desktop.
Save sunahsuh/4236fd4072c0edc73dbf765d52576a51 to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
from google.cloud import bigquery
import re
import requests
import os
import json
from redash_client.client import RedashClient
import csv
DATASET = 'telemetry'
TABLE_IDS = [
'clients_last_seen_v1',
'anonymous',
'clients_last_seen',
'firefox_desktop_exact_mau28',
'certificate_checker',
'system_addon_deployment_diagnostics',
'active_profiles_v1',
'addon_install_blocked',
'bhr',
'sync',
'sync_log_device_counts_v1',
'eng_workflow_hgpush_parquet_v1',
'ssl_ratios_v1',
'sync_flat_summary',
'smoot_usage_desktop_v1',
'firefox_accounts_exact_mau28_by_dimensions',
'crash_aggregates_v1',
'tls13_middlebox_repetition',
'main_summary_v4',
'pre_account',
'experiment_error_aggregates',
'fxa_content_events',
'sync_log_device_activity',
'telemetry_core_parquet',
'first_shutdown',
'core_clients_last_seen',
'firefox_nondesktop_exact_mau28_by_product',
'fxa_users_daily_v1',
'clients_daily',
'fxa_users_last_seen_v1_outgoing',
'modules',
'smoot_clients_daily_1percent_v1',
'remote_content_uptake',
'firefox_accounts_exact_mau28_v1',
'telemetry_heartbeat_parquet_v1',
'optout',
'main',
'disable_sha1rollout',
'sync_log_v1',
'crash_summary_v1',
'smoot_usage_all',
'lockwise_mobile_events_v1',
'smoot_usage_all_mtr_v1',
'clients_daily_scalar_aggregates_v1',
'ftu',
'sync_events',
'experiments',
'fenix_events_amplitude_v1',
'firefox_accounts_exact_mau28_by_dimensions_v1',
'firefox_kpi_dashboard_v1',
'smoot_metrics',
'smoot_usage_fxa_raw_v1',
'core_clients_daily',
'fxa_auth_bounce_events',
'firefox_accounts_exact_mau28_raw_v1',
'first_shutdown_summary',
'downgrade',
'smoot_usage_nondesktop_v1',
'health',
'crash_summary',
'smoot_usage_2week_raw_v1',
'telemetry_downgrade_parquet',
'telemetry_new_profile_parquet',
'searchvolextra',
'prio',
'fxa_content_auth_events',
'glean_clients_last_seen_v1',
'nondesktop_clients_last_seen_v1',
'crash_aggregates',
'telemetry_focus_event_parquet',
'telemetry_core_parquet_v3',
'main_summary',
'deployment_checker',
'voice',
'eng_workflow_hgpush_parquet',
'fxa_content_auth_events_v1',
'telemetry_anonymous_parquet',
'event',
'smoot_usage_desktop',
'shield_study_addon',
'smoot_clients_last_seen_1percent_cleaned_v1',
'testpilottest',
'experiments_aggregates',
'advancedtelemetry',
'tls_13_study_v1',
'retention',
'heartbeat',
'sync_log_device_counts',
'smoot_nondesktop_usage',
'mobile_metrics',
'firefox_kpi_dashboard',
'firefox_desktop_exact_mau28_by_dimensions_v1',
'retention_v1',
'socorro_crash',
'tls_13_study',
'tls13_middlebox_alt_server_hello_1',
'smoot_usage_metrics_raw_v1',
'android_anr_report',
'smoot_desktop_usage_raw_v1',
'telemetry_ip_privacy_parquet_v1',
'events_v1',
'firefox_nondesktop_exact_mau28',
'fxa_users_last_seen',
'shield_study_error',
'new_profile',
'telemetry_focus_event_parquet_v1',
'firefox_desktop_exact_mau28_v1',
'smoot_usage_v1',
'smoot_nondesktop_usage_v1',
'fxa_users_last_seen_v1',
'smoot_all_usage_v1',
'addons',
'firefox_nondesktop_exact_mau28_v1',
'addon_aggregates',
'tls_13_study_v2',
'firefox_nondesktop_exact_mau28_raw_v1',
'fxa_content_events_v1',
'update',
'smoot_desktop_0week_v1',
'events',
'churn_v3',
'smoot_usage_1week_raw_v1',
'smoot_desktop_usage',
'tls13_middlebox_draft22',
'fxa_users_last_seen_raw_v1',
'smoot_all_usage',
'firefox_nondesktop_exact_mau28_by_dimensions_v1',
'smoot_clients_daily_1percent',
'sync_flat_summary_v1',
'experiments_v1',
'shield_study',
'experiment_error_aggregates_v1',
'sync_events_v1',
'smoot_usage',
'sync_summary_v2',
'socorro_crash_v2',
'mobile_event',
'fxa_users_daily',
'addons_v2',
'focus_event',
'sync_summary',
'smoot_usage_fxa_v1',
'sync_log',
'clients_daily_v6',
'active_profiles',
'remote_content_uptake_v1',
'clients_profile_per_install_affected',
'shield_icq_v1',
'experiments_aggregates_v1',
'sync_bmk_total_per_day_v1',
'telemetry_heartbeat_parquet',
'crash',
'telemetry_shield_study_parquet_v1',
'fenix_events_v1',
'glean_clients_daily_v1',
'core_clients_daily_v1',
'telemetry_shield_study_parquet',
'fxa_auth_events',
'addon_aggregates_v2',
'smoot_usage_all_v1',
'telemetry_ip_privacy_parquet',
'client_probe_counts_v2',
'telemetry_downgrade_parquet_v1',
'smoot_usage_nondesktop_raw_v1',
'smoot_clients_last_seen_1percent_raw_v1',
'firefox_desktop_exact_mau28_by_dimensions',
'firefox_nondesktop_exact_mau28_by_product_v1',
'churn_v2',
'clients_daily_scalar_aggregates_v2',
'sync_bmk_total_per_day',
'telemetry_mobile_event_parquet',
'first_shutdown_summary_v4',
'fxa_all_events_v1',
'pioneer_study',
'clients_last_seen_raw_v1',
'lockwise_mobile_events',
'main_summary_v3',
'tls13_middlebox_beta',
'client_probe_counts_v1',
'smoot_nondesktop_usage_raw_v1',
'eng_workflow_build_parquet_v1',
'fxa_auth_bounce_events_v1',
'core_clients_last_seen_v1',
'flash_shield_study',
'block_autoplay',
'churn',
'tls13_middlebox_testing',
'searchvol',
'firefox_nondesktop_exact_mau28_by_dimensions',
'uitour_tag',
'ssl_ratios',
'fxa_auth_events_v1',
'smoot_desktop_0week',
'deletion',
'telemetry_mobile_event_parquet_v2',
'tls13_middlebox_ghack',
'smoot_clients_last_seen_1percent_cleaned',
'malware_addon_states',
'crash_summary_v2',
'fxa_all_events',
'glean_clients_last_seen_raw_v1',
'smoot_desktop_usage_v1',
'telemetry_new_profile_parquet_v2',
'core',
'smoot_clients_last_seen_1percent',
'core_clients_last_seen_raw_v1',
'sync_log_device_activity_v1',
'smoot_metrics_v1',
'tls_13_study_v4',
'saved_session',
'testpilot',
'smoot_clients_last_seen_1percent_v1',
'telemetry_anonymous_parquet_v1',
'outofdate_notifications_system_addon',
'untrusted_modules',
'tls_13_study_v3',
'smoot_usage_desktop_raw_v1',
'eng_workflow_build_parquet',
'clients_daily_histogram_aggregates_v1',
'smoot_usage_nondesktop',
'frecency_update',
'x_contextual_feature_recommendation'
]
SEARCH_DATASET = "search"
SEARCH_TABLE_IDS = [
"search_aggregates",
"search_aggregates_dev_v3",
"search_aggregates_v3",
"search_aggregates_v4",
"search_aggregates_v5",
"search_aggregates_v6",
"search_aggregates_v7",
"search_clients_daily",
"search_clients_daily_v3",
"search_clients_daily_v4",
"search_clients_daily_v5",
"search_clients_daily_v6",
"search_clients_daily_v7",
]
UDF_DATASET = "udf_legacy"
UDFS = [
"contains",
"date_format",
"to_iso8601",
"date_trunc",
]
substitutions = [
# Add dataset to telemetry dataset table names
(f"(^|\\s+)({'|'.join(TABLE_IDS)})(\\s+)", f"\\1{DATASET}.\\2\\3"),
# Add dataset to search dataset table names
(f"(^|\\s+)({'|'.join(SEARCH_TABLE_IDS)})(\\s+)", f"\\1{SEARCH_DATASET}.\\2\\3"),
# MMMMYYDD to MMMM-YY-DD
(r"20(\d\d)(\d\d)(\d\d)", r"20\1-\2-\3"),
# submission_date_s3 => submission_date
(r"submission_date_s3", r"submission_date"),
# sample_id string to int
(r"sample_id ([<=>]+) '(\d+)'", r"sample_id \1 \2"),
# remove ARRAY from array declarations
(r"(?i)ARRAY\s*\[", r"["),
# DOUBLE or FLOAT -> FLOAT64
(r"(?i)AS (DOUBLE|FLOAT)", "AS FLOAT64"),
# INTEGER -> INT64
(r"(?i)AS (INT|INTEGER|BIGINT|SMALLINT|TINYINT)\b", "AS INT64"),
# VARCHAR or CHAR -> STRING
(r"(?i)AS (VARCHAR|CHAR)", "AS STRING"),
# Common date parsing no longer necessary (note: keeping this after
# the submission_date_s3 sub means we catch more cases since some
# tables use submission_date as the date field
(r"(?i)DATE_PARSE\(submission_date,\s*'%Y%m%d'\)", "submission_date"),
# Similar to above, current date -> string is no longer necessary
(r"(?i)DATE_FORMAT\(current_date,\s*'%Y%m%d'\)", "CURRENT_DATE"),
# Map Access -> get_key
(r"([\w\.]+)\[('[\w-]+?')\]", r"udf.get_key(\1, \2)"),
# approx_distinct -> approx_count_distinct
(r"(?i)APPROX_DISTINCT\(", "APPROX_COUNT_DISTINCT("),
# TRY_CAST -> SAFE_CAST
(r"(?i)TRY_CAST\(", "SAFE_CAST("),
# element_at -> udf.get_key
(r"(?i)ELEMENT_AT\(", "udf.get_key("),
# use UDFs for some built-in functions
(f"(?i)\\b({'|'.join(UDFS)})\(", f"{UDF_DATASET}.\\1("),
# a few instances of this pattern can be simplified
(r"(?i)reduce\(map_values\(searches\), 0, \(x, y\) -> x \+ y, s -> s\)",
"(SELECT SUM(value) from UNNEST(searches))"),
# simple date_add invocations (addition)
(r"(?i)date_add\('(\w+)',\s*(\d+),\s*(\w+)\)",
"DATE_ADD(\\3, INTERVAL \\2 \\1)"),
# simple date_add invocations (subtraction)
(r"(?i)date_add\('(\w+)',\s*-(\d+),\s*(\w+)\)",
"DATE_SUB(\\3, INTERVAL \\2 \\1)"),
# UNIX_DATE built-in replaces more verbose version
("to_unixtime(current_date)/86400", "UNIX_DATE(CURRENT_DATE)"),
# UNION in bq requires either ALL or DISTINCT keywords
(r"\bUNION\b", "UNION ALL"),
# bare INTERVAL addition/subtraction translated to date_add/date_sub
(r"(?i)(\w+)\s+\s+INTERVAL '(\d+)'\s+(\w+)", r"DATE_ADD(\1, INTERVAL \2 \3)"),
(r"(?i)(\w+)\s-\s+INTERVAL '(\d+)'\s+(\w+)", r"DATE_SUB(\1, INTERVAL \2 \3)"),
]
def translate(text):
subs = [(re.compile(r[0]), r[1]) for r in substitutions]
for pattern, replacement in subs:
text = re.sub(pattern, replacement, text)
return text
BQ_DATASOURCE_ID = 63
with open('/Users/ssuh/dev/mozilla/scratch/query_ids_batch_2.txt', 'r') as f:
api_key = os.environ["REDASH_API_KEY"]
redash_client = RedashClient(api_key)
headers = {"Authorization": f"Key {api_key}"}
bq_client = bigquery.Client(project='moz-fx-data-derived-datasets')
results = []
for query_id in f:
q_id = query_id.strip()
new_query = redash_client.fork_query(q_id)
new_details = requests.get(
f"https://sql.telemetry.mozilla.org/api/queries/{new_query['id']}",
headers=headers
).json()
transformed = (
"/* This query was auto-transformed for BigQuery from "
f"https://sql.telemetry.mozilla.org/queries/{q_id} */\n\n"
+ translate(new_details['query'])
).encode('ascii', 'ignore').decode('ascii')
job_config = bigquery.QueryJobConfig()
job_config.dry_run = True
job_config.use_query_cache = False
query_job = bq_client.query(transformed)
redash_client.update_query(new_details['id'], new_details['name'],
transformed, BQ_DATASOURCE_ID,
new_details.get('description'))
"""
text = requests.get(
f"https://sql.telemetry.mozilla.org/api/queries/{query_id}",
headers=headers
).json()['query']
transformed = (
"/* This query was auto-transformed for BigQuery from "
f"https://sql.telemetry.mozilla.org/queries/{query_id} */\n\n"
+ translate(text)
)
job_config = bigquery.QueryJobConfig()
job_config.dry_run = True
job_config.use_query_cache = False
query_job = bq_client.query(transformed)
"""
obj = {
'origId': q_id,
'newId': new_details['id'],
'orig': new_details['query'],
'transformed': transformed,
'errorResult': query_job.error_result
}
results.append(obj)
print(json.dumps(results))
with open('/Users/ssuh/dev/mozilla/scratch/new_queries_batch_2.csv', 'w') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=results[0].keys())
writer.writeheader()
for r in results:
writer.writerow(r)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment