-- Test the use of Caffeine in UDFs used by refine_webrequest
USE aqu;

CREATE EXTERNAL TABLE IF NOT EXISTS `webrequest2`(
    `hostname` string COMMENT 'Source node hostname',
    `sequence` bigint COMMENT 'Per host sequence number',
    `dt` string COMMENT 'Timestamp at cache in ISO 8601',
    `time_firstbyte` double COMMENT 'Time to first byte',
    `ip` string COMMENT 'IP of packet at cache',
    `cache_status` string COMMENT 'Cache status',
    `http_status` string COMMENT 'HTTP status of response',
    `response_size` bigint COMMENT 'Response size',
    `http_method` string COMMENT 'HTTP method of request',
    `uri_host` string COMMENT 'Host of request',
    `uri_path` string COMMENT 'Path of request',
    `uri_query` string COMMENT 'Query of request',
    `content_type` string COMMENT 'Content-Type header of response',
    `referer` string COMMENT 'Referer header of request',
    `x_forwarded_for` string COMMENT 'X-Forwarded-For header of request (deprecated)',
    `user_agent` string COMMENT 'User-Agent header of request',
    `accept_language` string COMMENT 'Accept-Language header of request',
    `x_analytics` string COMMENT 'X-Analytics header of response',
    `range` string COMMENT 'Range header of response',
    `is_pageview` boolean COMMENT 'Indicates if this record was marked as a pageview during refinement',
    `record_version` string COMMENT 'Keeps track of changes in the table content definition - https://wikitech.wikimedia.org/wiki/Analytics/Data/Webrequest',
    `client_ip` string COMMENT 'Client IP - DEPRECATED - Same as ip.',
    `geocoded_data` map<string, string> COMMENT 'Geocoded map with continent, country_code, country, city, subdivision, postal_code, latitude, longitude, timezone keys and associated values.',
    -- Waiting for x_cache format to change before parsing into a map
    `x_cache` string COMMENT 'X-Cache header of response',
    -- The next two fields are meant to replace the original ua and x_analytics ones.
    -- However, such a schema modification implies backward incompatibility.
    -- We will replace them once we feel confident enough that 'every' backward incompatible change is done.
    `user_agent_map` map<string, string> COMMENT 'User-agent map with browser_family, browser_major, device_family, os_family, os_major, os_minor and wmf_app_version keys and associated values',
    `x_analytics_map` map<string, string> COMMENT 'X_analytics map view of the x_analytics field',
    `ts` timestamp COMMENT 'Unix timestamp in milliseconds extracted from dt',
    `access_method` string COMMENT 'Method used to access the site (mobile app|mobile web|desktop)',
    `agent_type` string COMMENT 'Categorises the agent making the webrequest as either user or spider (automata to be added).',
    `is_zero` boolean COMMENT 'NULL since the Wikipedia Zero program is over',
    `referer_class` string COMMENT 'Indicates if a referer is internal, external or unknown.',
    `normalized_host` struct<project_class: string, project: string, qualifiers: array<string>, tld: string, project_family: string> COMMENT 'struct containing project_family (such as wikipedia or wikidata for instance), project (such as en or commons), qualifiers (a list of in-between values, such as m) and tld (org most often)',
    `pageview_info` map<string, string> COMMENT 'map containing project, language_variant and page_title values only when is_pageview = TRUE.',
    `page_id` bigint COMMENT 'MediaWiki page_id for this page title. For redirects this could be the page_id of the redirect or the page_id of the target. This may not always be set, even if the page is actually a pageview.',
    `namespace_id` int COMMENT 'MediaWiki namespace_id for this page title. This may not always be set, even if the page is actually a pageview.',
    `tags` array<string> COMMENT 'List containing tags qualifying the request, ex: [portal, wikidata]. Will be used to split webrequest into smaller subsets.',
    `isp_data` map<string, string> COMMENT 'Internet Service Provider data in a map with keys isp, organization, autonomous_system_organization and autonomous_system_number',
    `accept` string COMMENT 'Accept header of request',
    `tls` string COMMENT 'TLS information of request',
    `tls_map` map<string, string> COMMENT 'Map view of TLS information (keys are vers, keyx, auth and ciph)',
    `ch_ua` string COMMENT 'Value of the Sec-CH-UA request header',
    `ch_ua_mobile` string COMMENT 'Value of the Sec-CH-UA-Mobile request header',
    `ch_ua_platform` string COMMENT 'Value of the Sec-CH-UA-Platform request header',
    `ch_ua_arch` string COMMENT 'Value of the Sec-CH-UA-Arch request header',
    `ch_ua_bitness` string COMMENT 'Value of the Sec-CH-UA-Bitness request header',
    `ch_ua_full_version_list` string COMMENT 'Value of the Sec-CH-UA-Full-Version-List request header',
    `ch_ua_model` string COMMENT 'Value of the Sec-CH-UA-Model request header',
    `ch_ua_platform_version` string COMMENT 'Value of the Sec-CH-UA-Platform-Version request header'
)
PARTITIONED BY (
    `webrequest_source` string COMMENT 'Source cluster',
    `year` int COMMENT 'Unpadded year of request',
    `month` int COMMENT 'Unpadded month of request',
    `day` int COMMENT 'Unpadded day of request',
    `hour` int COMMENT 'Unpadded hour of request'
)
STORED AS PARQUET
LOCATION 'hdfs://analytics-hadoop/user/aqu/webrequest2'
;
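-- Optional sanity check: verify the schema of the newly created table before loading data into it.
DESCRIBE FORMATTED aqu.webrequest2;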
ADD JAR /home/aqu/analytics-refinery-source/refinery-hive/target/refinery-hive-0.2.11-SNAPSHOT-shaded.jar;

CREATE TEMPORARY FUNCTION is_pageview AS 'org.wikimedia.analytics.refinery.hive.IsPageviewUDF';
CREATE TEMPORARY FUNCTION geocoded_data AS 'org.wikimedia.analytics.refinery.hive.GeocodedDataUDF';
CREATE TEMPORARY FUNCTION ua_parser AS 'org.wikimedia.analytics.refinery.hive.UAParserUDF';
CREATE TEMPORARY FUNCTION get_access_method AS 'org.wikimedia.analytics.refinery.hive.GetAccessMethodUDF';
CREATE TEMPORARY FUNCTION is_spider AS 'org.wikimedia.analytics.refinery.hive.IsSpiderUDF';
CREATE TEMPORARY FUNCTION referer_classify AS 'org.wikimedia.analytics.refinery.hive.SmartReferrerClassifierUDF';
CREATE TEMPORARY FUNCTION get_pageview_info AS 'org.wikimedia.analytics.refinery.hive.GetPageviewInfoUDF';
CREATE TEMPORARY FUNCTION normalize_host AS 'org.wikimedia.analytics.refinery.hive.HostNormalizerUDF';
CREATE TEMPORARY FUNCTION get_tags AS 'org.wikimedia.analytics.refinery.hive.GetWebrequestTagsUDF';
CREATE TEMPORARY FUNCTION isp_data AS 'org.wikimedia.analytics.refinery.hive.GetISPDataUDF';
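-- Optional smoke test: call a few of the freshly registered UDFs on literal values to check
-- that the jar and classes resolve. The host, path and referer below are illustrative
-- placeholders, not rows taken from the dataset.
SELECT
    is_pageview('en.wikipedia.org', '/wiki/Main_Page', '', '200', 'text/html', 'Mozilla/5.0', '-'),
    normalize_host('en.m.wikipedia.org'),
    referer_classify('https://www.google.com/');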
SET spark.sql.shuffle.partitions = 256;
WITH distinct_rows AS (
    SELECT DISTINCT
        hostname,
        sequence,
        dt,
        time_firstbyte,
        ip,
        cache_status,
        http_status,
        response_size,
        http_method,
        uri_host,
        uri_path,
        uri_query,
        content_type,
        referer,
        x_forwarded_for,
        user_agent,
        accept_language,
        x_analytics,
        `range`,
        x_cache,
        accept,
        tls,
        ch_ua,
        ch_ua_mobile,
        ch_ua_platform,
        ch_ua_arch,
        ch_ua_bitness,
        ch_ua_full_version_list,
        ch_ua_model,
        ch_ua_platform_version
    FROM
        wmf_raw.webrequest
    WHERE
        webrequest_source='text' AND
        year=2023 AND month=2 AND day=10 AND hour=0
),
distinct_rows_and_reused_fields AS (
    SELECT
        hostname,
        sequence,
        dt,
        time_firstbyte,
        ip,
        cache_status,
        http_status,
        response_size,
        http_method,
        uri_host,
        uri_path,
        uri_query,
        content_type,
        referer,
        x_forwarded_for,
        user_agent,
        accept_language,
        x_analytics,
        `range`,
        x_cache,
        accept,
        tls,
        -- Materialize reused computed fields
        is_pageview(uri_host, uri_path, uri_query, http_status, content_type, user_agent, x_analytics) as is_pageview,
        ua_parser(user_agent) as user_agent_map,
        CASE COALESCE(x_analytics, '-')
            WHEN '-' THEN NULL
            ELSE str_to_map(x_analytics, '\;', '=')
        END as x_analytics_map,
        ch_ua,
        ch_ua_mobile,
        ch_ua_platform,
        ch_ua_arch,
        ch_ua_bitness,
        ch_ua_full_version_list,
        ch_ua_model,
        ch_ua_platform_version
    FROM distinct_rows
)
INSERT OVERWRITE TABLE aqu.webrequest2
PARTITION(webrequest_source='text', year=2023, month=2, day=10, hour=0)
SELECT
    hostname,
    sequence,
    dt,
    time_firstbyte,
    ip,
    cache_status,
    http_status,
    response_size,
    http_method,
    uri_host,
    uri_path,
    uri_query,
    content_type,
    referer,
    x_forwarded_for,
    user_agent,
    accept_language,
    x_analytics,
    `range`,
    is_pageview,
    '${record_version}' as record_version,
    ip as client_ip,
    geocoded_data(ip) as geocoded_data,
    x_cache,
    user_agent_map,
    x_analytics_map,
    CAST(unix_timestamp(dt, "yyyy-MM-dd'T'HH:mm:ssX") * 1.0 as timestamp) as ts,
    get_access_method(uri_host, user_agent) as access_method,
    CASE
        WHEN ((user_agent_map['device_family'] = 'Spider') OR (is_spider(user_agent))) THEN 'spider'
        ELSE 'user'
    END as agent_type,
    NULL as is_zero,
    referer_classify(referer) as referer_class,
    normalize_host(uri_host) as normalized_host,
    CASE
        WHEN is_pageview THEN get_pageview_info(uri_host, uri_path, uri_query)
        ELSE NULL
    END as pageview_info,
    CAST(x_analytics_map['page_id'] AS BIGINT) as page_id,
    CAST(x_analytics_map['ns'] AS INT) as namespace_id,
    get_tags(uri_host, uri_path, uri_query, http_status, content_type, user_agent, x_analytics) as tags,
    isp_data(ip) as isp_data,
    accept,
    tls,
    CASE COALESCE(tls, '-')
        WHEN '-' THEN NULL
        ELSE str_to_map(tls, '\;', '=')
    END as tls_map,
    ch_ua,
    ch_ua_mobile,
    ch_ua_platform,
    ch_ua_arch,
    ch_ua_bitness,
    ch_ua_full_version_list,
    ch_ua_model,
    ch_ua_platform_version
FROM distinct_rows_and_reused_fields
;
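-- Optional: confirm the target partition landed before running the comparisons below.
SHOW PARTITIONS aqu.webrequest2;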
-- Let's compare the new dataset with the former one.
SELECT count(1)
FROM aqu.webrequest2 w2
WHERE w2.webrequest_source='text' AND w2.year=2023 AND w2.month=2 AND w2.day=10 AND w2.hour=0;
-- 261_861_061

SELECT count(1)
FROM wmf.webrequest
WHERE webrequest_source='text' AND year=2023 AND month=2 AND day=10 AND hour=0;
-- 261_861_061
-- And now let's test the result of the UDFs
SELECT count(1) as t
FROM aqu.webrequest2 w2
LEFT OUTER JOIN wmf.webrequest w
    USING(webrequest_source, year, month, day, hour, hostname, sequence, dt)
WHERE
    w2.webrequest_source='text' AND w2.year=2023 AND w2.month=2 AND w2.day=10 AND w2.hour=0
    AND w.is_pageview <> w2.is_pageview
    AND cast(w.user_agent_map as string) <> cast(w2.user_agent_map as string)
    AND cast(w.x_analytics_map as string) <> cast(w2.x_analytics_map as string)
    AND cast(w.geocoded_data as string) <> cast(w2.geocoded_data as string)
    AND w.access_method <> w2.access_method
    AND w.agent_type <> w2.agent_type
    AND w.referer_class <> w2.referer_class
    AND w.normalized_host <> w2.normalized_host
    AND w.page_id <> w2.page_id
    AND w.namespace_id <> w2.namespace_id
    AND w.tags <> w2.tags
    AND cast(w.isp_data as string) <> cast(w2.isp_data as string)
    AND cast(w.tls_map as string) <> cast(w2.tls_map as string);
-- => 0 Good!
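-- If the count above were ever non-zero, a per-field breakdown along these lines (a sketch
-- reusing the same join keys and a subset of the compared fields) would help pinpoint which
-- UDF diverged:
SELECT
    sum(CASE WHEN w.is_pageview <> w2.is_pageview THEN 1 ELSE 0 END) as is_pageview_diffs,
    sum(CASE WHEN w.agent_type <> w2.agent_type THEN 1 ELSE 0 END) as agent_type_diffs,
    sum(CASE WHEN cast(w.geocoded_data as string) <> cast(w2.geocoded_data as string) THEN 1 ELSE 0 END) as geocoded_data_diffs
FROM aqu.webrequest2 w2
LEFT OUTER JOIN wmf.webrequest w
    USING(webrequest_source, year, month, day, hour, hostname, sequence, dt)
WHERE
    w2.webrequest_source='text' AND w2.year=2023 AND w2.month=2 AND w2.day=10 AND w2.hour=0;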