-- Test the use of Caffeine in UDFs used by refine_webrequest
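-- Plan: (1) create a test copy of the webrequest schema under the aqu database,
-- (2) register the refinery-hive UDFs from a locally built snapshot jar (which
-- presumably carries the Caffeine-backed caching changes under test),
-- (3) re-run the refinement query for one hour of the 'text' source, and
-- (4) compare row counts and UDF outputs against the production wmf.webrequest.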
use aqu;
CREATE EXTERNAL TABLE IF NOT EXISTS `webrequest2`(
`hostname` string COMMENT 'Source node hostname',
`sequence` bigint COMMENT 'Per host sequence number',
`dt` string COMMENT 'Timestamp at cache in ISO 8601 format',
`time_firstbyte` double COMMENT 'Time to first byte',
`ip` string COMMENT 'IP of packet at cache',
`cache_status` string COMMENT 'Cache status',
`http_status` string COMMENT 'HTTP status of response',
`response_size` bigint COMMENT 'Response size',
`http_method` string COMMENT 'HTTP method of request',
`uri_host` string COMMENT 'Host of request',
`uri_path` string COMMENT 'Path of request',
`uri_query` string COMMENT 'Query of request',
`content_type` string COMMENT 'Content-Type header of response',
`referer` string COMMENT 'Referer header of request',
`x_forwarded_for` string COMMENT 'X-Forwarded-For header of request (deprecated)',
`user_agent` string COMMENT 'User-Agent header of request',
`accept_language` string COMMENT 'Accept-Language header of request',
`x_analytics` string COMMENT 'X-Analytics header of response',
`range` string COMMENT 'Range header of response',
`is_pageview` boolean COMMENT 'Indicates if this record was marked as a pageview during refinement',
`record_version` string COMMENT 'Keeps track of changes in the table content definition - https://wikitech.wikimedia.org/wiki/Analytics/Data/Webrequest',
`client_ip` string COMMENT 'Client IP - DEPRECATED - Same as IP.',
`geocoded_data` map<string, string> COMMENT 'Geocoded map with continent, country_code, country, city, subdivision, postal_code, latitude, longitude, timezone keys and associated values.',
-- Waiting for x_cache format to change before parsing into a map
`x_cache` string COMMENT 'X-Cache header of response',
-- The next two fields are meant to replace the original ua and x_analytics ones.
-- However, such a schema modification implies backward incompatibility.
-- We will replace them once we are confident that every backward-incompatible change is done.
`user_agent_map` map<string, string> COMMENT 'User-agent map with browser_family, browser_major, device_family, os_family, os_major, os_minor and wmf_app_version keys and associated values',
`x_analytics_map` map<string, string> COMMENT 'X_analytics map view of the x_analytics field',
`ts` timestamp COMMENT 'Unix timestamp in milliseconds extracted from dt',
`access_method` string COMMENT 'Method used to access the site (mobile app|mobile web|desktop)',
`agent_type` string COMMENT 'Categorises the agent making the webrequest as either user or spider (automata to be added).',
`is_zero` boolean COMMENT 'NULL, as the Wikipedia Zero program is over',
`referer_class` string COMMENT 'Indicates if a referer is internal, external or unknown.',
`normalized_host` struct<project_class: string, project: string, qualifiers: array<string>, tld: string, project_family: string> COMMENT 'struct containing project_family (such as wikipedia or wikidata for instance), project (such as en or commons), qualifiers (a list of in-between values, such as m) and tld (org most often)',
`pageview_info` map<string, string> COMMENT 'map containing project, language_variant and page_title values only when is_pageview = TRUE.',
`page_id` bigint COMMENT 'MediaWiki page_id for this page title. For redirects this could be the page_id of the redirect or the page_id of the target. This may not always be set, even if the page is actually a pageview.',
`namespace_id` int COMMENT 'MediaWiki namespace_id for this page title. This may not always be set, even if the page is actually a pageview.',
`tags` array<string> COMMENT 'List containing tags qualifying the request, ex: [portal, wikidata]. Will be used to split webrequest into smaller subsets.',
`isp_data` map<string, string> COMMENT 'Internet Service Provider data in a map with keys isp, organization, autonomous_system_organization and autonomous_system_number',
`accept` string COMMENT 'Accept header of request',
`tls` string COMMENT 'TLS information of request',
`tls_map` map<string, string> COMMENT 'Map view of TLS information (keys are vers, keyx, auth and ciph)',
`ch_ua` string COMMENT 'Value of the Sec-CH-UA request header',
`ch_ua_mobile` string COMMENT 'Value of the Sec-CH-UA-Mobile request header',
`ch_ua_platform` string COMMENT 'Value of the Sec-CH-UA-Platform request header',
`ch_ua_arch` string COMMENT 'Value of the Sec-CH-UA-Arch request header',
`ch_ua_bitness` string COMMENT 'Value of the Sec-CH-UA-Bitness request header',
`ch_ua_full_version_list` string COMMENT 'Value of the Sec-CH-UA-Full-Version-List request header',
`ch_ua_model` string COMMENT 'Value of the Sec-CH-UA-Model request header',
`ch_ua_platform_version` string COMMENT 'Value of the Sec-CH-UA-Platform-Version request header'
)
PARTITIONED BY (
`webrequest_source` string COMMENT 'Source cluster',
`year` int COMMENT 'Unpadded year of request',
`month` int COMMENT 'Unpadded month of request',
`day` int COMMENT 'Unpadded day of request',
`hour` int COMMENT 'Unpadded hour of request'
)
STORED AS PARQUET
LOCATION 'hdfs://analytics-hadoop/user/aqu/webrequest2'
;
ADD JAR /home/aqu/analytics-refinery-source/refinery-hive/target/refinery-hive-0.2.11-SNAPSHOT-shaded.jar;
CREATE TEMPORARY FUNCTION is_pageview as 'org.wikimedia.analytics.refinery.hive.IsPageviewUDF';
CREATE TEMPORARY FUNCTION geocoded_data as 'org.wikimedia.analytics.refinery.hive.GeocodedDataUDF';
CREATE TEMPORARY FUNCTION ua_parser as 'org.wikimedia.analytics.refinery.hive.UAParserUDF';
CREATE TEMPORARY FUNCTION get_access_method as 'org.wikimedia.analytics.refinery.hive.GetAccessMethodUDF';
CREATE TEMPORARY FUNCTION is_spider as 'org.wikimedia.analytics.refinery.hive.IsSpiderUDF';
CREATE TEMPORARY FUNCTION referer_classify AS 'org.wikimedia.analytics.refinery.hive.SmartReferrerClassifierUDF';
CREATE TEMPORARY FUNCTION get_pageview_info AS 'org.wikimedia.analytics.refinery.hive.GetPageviewInfoUDF';
CREATE TEMPORARY FUNCTION normalize_host AS 'org.wikimedia.analytics.refinery.hive.HostNormalizerUDF';
CREATE TEMPORARY FUNCTION get_tags AS 'org.wikimedia.analytics.refinery.hive.GetWebrequestTagsUDF';
CREATE TEMPORARY FUNCTION isp_data as 'org.wikimedia.analytics.refinery.hive.GetISPDataUDF';
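-- Optional smoke test with made-up literal values, just to check that the jar
-- loads and the function registrations resolve before launching the full job:
SELECT normalize_host('en.m.wikipedia.org'), is_spider('Mozilla/5.0');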
SET spark.sql.shuffle.partitions = 256;
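-- (The setting above controls how many partitions Spark uses for shuffle stages,
-- here mainly the SELECT DISTINCT de-duplication in the query below.)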
WITH distinct_rows AS (
SELECT DISTINCT
hostname,
sequence,
dt,
time_firstbyte,
ip,
cache_status,
http_status,
response_size,
http_method,
uri_host,
uri_path,
uri_query,
content_type,
referer,
x_forwarded_for,
user_agent,
accept_language,
x_analytics,
`range`,
x_cache,
accept,
tls,
ch_ua,
ch_ua_mobile,
ch_ua_platform,
ch_ua_arch,
ch_ua_bitness,
ch_ua_full_version_list,
ch_ua_model,
ch_ua_platform_version
FROM
wmf_raw.webrequest
WHERE
webrequest_source='text' AND
year=2023 AND month=2 AND day=10 AND hour=0
),
distinct_rows_and_reused_fields AS (
SELECT
hostname,
sequence,
dt,
time_firstbyte,
ip,
cache_status,
http_status,
response_size,
http_method,
uri_host,
uri_path,
uri_query,
content_type,
referer,
x_forwarded_for,
user_agent,
accept_language,
x_analytics,
`range`,
x_cache,
accept,
tls,
-- Materialize reused computed fields
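-- (is_pageview is reused for pageview_info, user_agent_map for agent_type, and
-- x_analytics_map for page_id/namespace_id in the final SELECT, so each UDF call
-- is written once here instead of being repeated in every dependent expression.)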
is_pageview(uri_host, uri_path, uri_query, http_status, content_type, user_agent, x_analytics) as is_pageview,
ua_parser(user_agent) as user_agent_map,
CASE COALESCE(x_analytics, '-')
WHEN '-' THEN NULL
ELSE str_to_map(x_analytics, '\;', '=')
END as x_analytics_map,
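-- (str_to_map splits a 'k1=v1;k2=v2' style string into a map, using the first
-- delimiter between pairs and the second between key and value; the ';' is
-- escaped as '\;' so the CLI does not mistake it for a statement terminator.
-- The CASE above yields NULL when the header is missing or is the '-' placeholder.)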
ch_ua,
ch_ua_mobile,
ch_ua_platform,
ch_ua_arch,
ch_ua_bitness,
ch_ua_full_version_list,
ch_ua_model,
ch_ua_platform_version
FROM distinct_rows
)
INSERT OVERWRITE TABLE aqu.webrequest2
PARTITION(webrequest_source='text',year=2023,month=2,day=10,hour=0)
SELECT
hostname,
sequence,
dt,
time_firstbyte,
ip,
cache_status,
http_status,
response_size,
http_method,
uri_host,
uri_path,
uri_query,
content_type,
referer,
x_forwarded_for,
user_agent,
accept_language,
x_analytics,
`range`,
is_pageview,
'${record_version}' as record_version,
ip as client_ip,
geocoded_data(ip) as geocoded_data,
x_cache,
user_agent_map,
x_analytics_map,
CAST(unix_timestamp(dt, "yyyy-MM-dd'T'HH:mm:ssX") * 1.0 as timestamp) as ts,
get_access_method(uri_host, user_agent) as access_method,
CASE
WHEN ((user_agent_map['device_family'] = 'Spider') OR (is_spider(user_agent))) THEN 'spider'
ELSE 'user'
END as agent_type,
NULL as is_zero,
referer_classify(referer) as referer_class,
normalize_host(uri_host) as normalized_host,
CASE
WHEN is_pageview THEN get_pageview_info(uri_host, uri_path, uri_query)
ELSE NULL
END as pageview_info,
CAST(x_analytics_map['page_id'] AS BIGINT) as page_id,
CAST(x_analytics_map['ns'] AS BIGINT) as namespace_id,
get_tags(uri_host, uri_path, uri_query, http_status, content_type, user_agent, x_analytics) as tags,
isp_data(ip) as isp_data,
accept,
tls,
CASE COALESCE(tls, '-')
WHEN '-' THEN NULL
ELSE str_to_map(tls, '\;', '=')
END as tls_map,
ch_ua,
ch_ua_mobile,
ch_ua_platform,
ch_ua_arch,
ch_ua_bitness,
ch_ua_full_version_list,
ch_ua_model,
ch_ua_platform_version
FROM distinct_rows_and_reused_fields
;
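-- Optional check: confirm the target partition landed before comparing.
SHOW PARTITIONS aqu.webrequest2;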
-- Let's compare the new dataset with the existing one.
select count(1)
from aqu.webrequest2 w2
where w2.webrequest_source='text' and w2.year = 2023 and w2.month=2 and w2.day=10 and w2.hour=0;
-- 261_861_061
select count(1)
from wmf.webrequest
where webrequest_source='text' and year = 2023 and month=2 and day=10 and hour=0;
-- 261_861_061
-- And now let's test the result of the UDFs
select count(1) as t
from aqu.webrequest2 w2
left outer join wmf.webrequest w
using(webrequest_source, year, month, day, hour, hostname, sequence, dt)
where
w2.webrequest_source='text' and w2.year = 2023 and w2.month=2 and w2.day=10 and w2.hour=0
and w.is_pageview <> w2.is_pageview
and cast(w.user_agent_map as string) <> cast(w2.user_agent_map as string)
and cast(w.x_analytics_map as string) <> cast(w2.x_analytics_map as string)
and cast(w.geocoded_data as string) <> cast(w2.geocoded_data as string)
and w.access_method <> w2.access_method
and w.agent_type <> w2.agent_type
and w.referer_class <> w2.referer_class
and w.normalized_host <> w2.normalized_host
and w.page_id <> w2.page_id
and w.namespace_id <> w2.namespace_id
and w.tags <> w2.tags
and cast(w.isp_data as string) <> cast(w2.isp_data as string)
and cast(w.tls_map as string) <> cast(w2.tls_map as string);
-- => 0 Good!
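-- Note: the conjunctive WHERE above only counts rows where every listed field
-- differs at once, and <> evaluates to NULL rather than TRUE when either side is
-- NULL. A stricter variant (sketched below, result not recorded in this gist)
-- would OR the comparisons so that any single mismatching field counts:
-- select count(1) as t
-- from aqu.webrequest2 w2
-- left outer join wmf.webrequest w
-- using(webrequest_source, year, month, day, hour, hostname, sequence, dt)
-- where w2.webrequest_source='text' and w2.year = 2023 and w2.month=2 and w2.day=10 and w2.hour=0
--   and (w.is_pageview <> w2.is_pageview
--     or cast(w.user_agent_map as string) <> cast(w2.user_agent_map as string)
--     or cast(w.x_analytics_map as string) <> cast(w2.x_analytics_map as string)
--     or cast(w.geocoded_data as string) <> cast(w2.geocoded_data as string)
--     or w.access_method <> w2.access_method
--     or w.agent_type <> w2.agent_type
--     or w.referer_class <> w2.referer_class
--     or w.normalized_host <> w2.normalized_host
--     or w.page_id <> w2.page_id
--     or w.namespace_id <> w2.namespace_id
--     or w.tags <> w2.tags
--     or cast(w.isp_data as string) <> cast(w2.isp_data as string)
--     or cast(w.tls_map as string) <> cast(w2.tls_map as string));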