Skip to content

Instantly share code, notes, and snippets.

@sunahsuh
Created August 30, 2019 16:29
Show Gist options
  • Save sunahsuh/40d250b542890e17967a38cb283d39ba to your computer and use it in GitHub Desktop.
Save sunahsuh/40d250b542890e17967a38cb283d39ba to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
from google.cloud import bigquery
import re
DATASET = 'telemetry'
TABLE_IDS = [
'active_profiles',
'active_profiles_v1',
'addon_install_blocked',
'addons',
'addons_aggregates',
'addons_aggregates_v2',
'addons_v2',
'advancedtelemetry',
'android_anr_report',
'anonymous',
'bhr',
'block_autoplay',
'certificate_checker',
'churn',
'churn_v2',
'churn_v3',
'client_probe_counts_v1',
'client_probe_counts_v2',
'clients_aggregates_v1',
'clients_aggregates_v2',
'clients_daily',
'clients_daily_histogram_aggregates_v1',
'clients_daily_histogram_aggregates_v2',
'clients_daily_keyed_scalar_aggregates_v1',
'clients_daily_scalar_aggregates_v1',
'clients_daily_v6',
'clients_last_seen',
'clients_last_seen_raw_v1',
'clients_last_seen_v1',
'clients_profile_per_install_affected',
'core',
'core_clients_daily',
'core_clients_daily_v1',
'core_clients_last_seen',
'core_clients_last_seen_raw_v1',
'core_clients_last_seen_v1',
'crash',
'crash_aggregates',
'crash_aggregates_v1',
'crash_summary',
'crash_summary_v1',
'crash_summary_v2',
'deletion',
'deployment_checker',
'disable_sha1rollout',
'downgrade',
'eng_workflow_build_parquet',
'eng_workflow_build_parquet_v1',
'eng_workflow_hgpush_parquet',
'eng_workflow_hgpush_parquet_v1',
'event',
'events',
'events_v1',
'experiment_error_aggregates',
'experiment_error_aggregates_v1',
'experiments',
'experiments_aggregates',
'experiments_aggregates_v1',
'experiments_v1',
'fenix_events_amplitude_v1',
'fenix_events_v1',
'firefox_accounts_exact_mau28_by_dimensions',
'firefox_accounts_exact_mau28_by_dimensions_v1',
'firefox_accounts_exact_mau28_raw_v1',
'firefox_accounts_exact_mau28_v1',
'firefox_desktop_exact_mau28',
'firefox_desktop_exact_mau28_by_dimensions',
'firefox_desktop_exact_mau28_by_dimensions_v1',
'firefox_desktop_exact_mau28_v1',
'firefox_kpi_dashboard',
'firefox_kpi_dashboard_v1',
'firefox_nondesktop_exact_mau28',
'firefox_nondesktop_exact_mau28_by_dimensions',
'firefox_nondesktop_exact_mau28_by_dimensions_v1',
'firefox_nondesktop_exact_mau28_by_product',
'firefox_nondesktop_exact_mau28_by_product_v1',
'firefox_nondesktop_exact_mau28_raw_v1',
'firefox_nondesktop_exact_mau28_v1',
'first_shutdown',
'first_shutdown_summary',
'first_shutdown_summary_v4',
'flash_shield_study',
'focus_event',
'frecency_update',
'ftu',
'fxa_all_events',
'fxa_all_events_v1',
'fxa_auth_bounce_events',
'fxa_auth_bounce_events_v1',
'fxa_auth_events',
'fxa_auth_events_v1',
'fxa_content_auth_events',
'fxa_content_auth_events_v1',
'fxa_content_events',
'fxa_content_events_v1',
'fxa_users_daily',
'fxa_users_daily_v1',
'fxa_users_last_seen',
'fxa_users_last_seen_raw_v1',
'fxa_users_last_seen_v1',
'fxa_users_last_seen_v1_outgoing',
'glean_clients_daily_v1',
'glean_clients_last_seen_raw_v1',
'glean_clients_last_seen_v1',
'health',
'heartbeat',
'lockwise_mobile_events',
'lockwise_mobile_events_v1',
'main',
'main_summary',
'main_summary_v3',
'main_summary_v4',
'malware_addon_states',
'mobile_event',
'mobile_metrics',
'modules',
'new_profile',
'nondesktop_clients_last_seen_v1',
'optout',
'outofdate_notifications_system_addon',
'pioneer_study',
'pre_account',
'prio',
'remote_content_uptake',
'remote_content_uptake_v1',
'retention',
'retention_v1',
'saved_session',
'searchvol',
'searchvolextra',
'shield_icq_v1',
'shield_study',
'shield_study_addon',
'shield_study_error',
'smoot_all_usage',
'smoot_all_usage_v1',
'smoot_clients_daily_1percent',
'smoot_clients_daily_1percent_v1',
'smoot_clients_last_seen_1percent',
'smoot_clients_last_seen_1percent_cleaned',
'smoot_clients_last_seen_1percent_cleaned_v1',
'smoot_clients_last_seen_1percent_raw_v1',
'smoot_clients_last_seen_1percent_v1',
'smoot_desktop_0week',
'smoot_desktop_0week_v1',
'smoot_desktop_usage',
'smoot_desktop_usage_raw_v1',
'smoot_desktop_usage_v1',
'smoot_metrics',
'smoot_metrics_v1',
'smoot_nondesktop_usage',
'smoot_nondesktop_usage_raw_v1',
'smoot_nondesktop_usage_v1',
'smoot_usage',
'smoot_usage_1week_raw_v1',
'smoot_usage_2week_raw_v1',
'smoot_usage_all',
'smoot_usage_all_mtr_v1',
'smoot_usage_all_v1',
'smoot_usage_desktop',
'smoot_usage_desktop_raw_v1',
'smoot_usage_desktop_v1',
'smoot_usage_fxa_raw_v1',
'smoot_usage_fxa_v1',
'smoot_usage_metrics_raw_v1',
'smoot_usage_nondesktop',
'smoot_usage_nondesktop_raw_v1',
'smoot_usage_nondesktop_v1',
'smoot_usage_v1',
'socorro_crash',
'socorro_crash_v2',
'ssl_ratios',
'ssl_ratios_v1',
'sync',
'sync_events',
'sync_events_v1',
'sync_flat_summary',
'sync_flat_summary_v1',
'sync_log',
'sync_log_device_activity',
'sync_log_device_activity_v1',
'sync_log_device_counts',
'sync_log_device_counts_v1',
'sync_log_v1',
'sync_summary',
'sync_summary_v2',
'system_addon_deployment_diagnostics',
'telemetry_anonymous_parquet',
'telemetry_anonymous_parquet_v1',
'telemetry_core_parquet',
'telemetry_core_parquet_v3',
'telemetry_downgrade_parquet',
'telemetry_downgrade_parquet_v1',
'telemetry_focus_event_parquet',
'telemetry_focus_event_parquet_v1',
'telemetry_heartbeat_parquet',
'telemetry_heartbeat_parquet_v1',
'telemetry_ip_privacy_parquet',
'telemetry_ip_privacy_parquet_v1',
'telemetry_mobile_event_parquet',
'telemetry_mobile_event_parquet_v2',
'telemetry_new_profile_parquet',
'telemetry_new_profile_parquet_v2',
'telemetry_shield_study_parquet',
'telemetry_shield_study_parquet_v1',
'testpilot',
'testpilottest',
'tls13_middlebox_alt_server_hello_1',
'tls13_middlebox_beta',
'tls13_middlebox_draft22',
'tls13_middlebox_ghack',
'tls13_middlebox_repetition',
'tls13_middlebox_testing',
'tls_13_study',
'tls_13_study_v1',
'tls_13_study_v2',
'tls_13_study_v3',
'tls_13_study_v4',
'uitour_tag',
'untrusted_modules',
'update',
'x_contextual_feature_recommendation'
]
UDF_DATASET = "udf_legacy"
UDFS = [
"contains",
"date_format",
"to_iso8601"
]
substitutions = [
# Add dataset to table names
(f"(^|\\s+)({'|'.join(TABLE_IDS)})\\b", f"\\1{DATASET}.\\2"),
# MMMMYYDD to MMMM-YY-DD
(r"20(\d\d)(\d\d)(\d\d)", r"20\1-\2-\3"),
# submission_date_s3 => submission_date
(r"submission_date_s3", r"submission_date"),
# sample_id string to int
(r"sample_id ([<=>]+) '(\d+)'", r"sample_id \1 \2"),
# remove ARRAY from array declarations
(r"(?i)ARRAY\s*\[", r"["),
# DOUBLE or FLOAT -> FLOAT64
(r"(?i)AS (DOUBLE|FLOAT)", "AS FLOAT64"),
# INTEGER -> INT64
(r"(?i)AS (INT|INTEGER|BIGINT|SMALLINT|TINYINT)\b", "AS INT64"),
# VARCHAR or CHAR -> STRING
(r"(?i)AS (VARCHAR|CHAR)", "AS STRING"),
# Common date parsing no longer necessary (note: keeping this after
# the submission_date_s3 sub means we catch more cases since some
# tables use submission_date as the date field
(r"(?i)DATE_PARSE\(submission_date, '%Y%m%d'\)", "submission_date"),
# Map Access -> get_key
(r"([\w\.]+)\[('[\w]+?')\]", r"udf.get_key(\1, \2)"),
# approx_distinct -> approx_count_distinct
(r"(?i)APPROX_DISTINCT\(", "APPROX_COUNT_DISTINCT("),
# TRY_CAST -> SAFE_CAST
(r"(?i)TRY_CAST\(", "SAFE_CAST("),
# element_at -> udf.get_key
(r"(?i)ELEMENT_AT\(", "udf.get_key("),
# use UDFs for some built-in functions
(f"(?i)\\b({'|'.join(UDFS)})\(", r"{UDF_DATASET}.\1("),
]
def translate(text):
client = bigquery.Client(project='moz-fx-data-derived-datasets')
subs = [(re.compile(r[0]), r[1]) for r in substitutions]
for pattern, replacement in subs:
text = re.sub(pattern, replacement, text)
return text
with open('/Users/ssuh/dev/mozilla/scratch/queries.txt', 'r') as f:
q = f.read()
print(translate(q))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment