Skip to content

Instantly share code, notes, and snippets.

@Ironholds
Created December 15, 2014 16:24
Show Gist options
  • Save Ironholds/96558613fe38dd4d1961 to your computer and use it in GitHub Desktop.
Save Ironholds/96558613fe38dd4d1961 to your computer and use it in GitHub Desktop.
query
SET hive.exec.compress.output=true;
SET whitelisted_mediawiki_projects = 'commons', 'meta', 'incubator', 'species';
CREATE TABLE ironholds.pageviews_sample_test(qualifier STRING, count_views INT);
INSERT OVERWRITE TABLE ironholds.pageviews_sample_test
SELECT
CONCAT(sub1.language_and_site, sub1.project_suffix) qualifier,
COUNT(*) count_views
FROM (
SELECT
regexp_extract(uri_host, '^([A-Za-z0-9-]+(\\.(zero|m))?)\\.[a-z]*\\.org$') language_and_site,
CASE regexp_extract(uri_host, '\\.(wik(ipedia|ibooks|tionary|imediafoundation|imedia|inews|iquote|isource|iversity|ivoyage|idata)|mediawiki)\\.org$')
WHEN 'wikipedia' THEN ''
WHEN 'wikibooks' THEN '.b'
WHEN 'wiktionary' THEN '.d'
WHEN 'wikimediafoundation' THEN '.f'
WHEN 'wikimedia' THEN '.m'
WHEN 'wikinews' THEN '.n'
WHEN 'wikiquote' THEN '.q'
WHEN 'wikisource' THEN '.s'
WHEN 'wikiversity' THEN '.v'
WHEN 'wikivoyage' THEN '.voy'
WHEN 'mediawiki' THEN '.w'
WHEN 'wikidata' THEN '.wd'
ELSE NULL END project_suffix
FROM wmf_raw.webrequest
WHERE webrequest_source IN ('text', 'mobile')
AND year=2014
AND month=12
AND day=7
AND hour=12
AND (content_type IN('text/html\; charset=iso-8859-1','text/html\; charset=ISO-8859-1','text/html','text/html\; charset=utf-8','text/html\; charset=UTF-8') AND uri_path NOT LIKE ('api.php')) OR (content_type = 'application/json' AND user_agent LIKE('WikipediaApp') AND uri_path LIKE('sections=0'))
AND uri_path NOT RLIKE('(BannerRandom|CentralAutoLogin|ZeroRatedMobileAccess|UserLogin|MobileEditor)')
AND uri_path RLIKE('(/zh(-(tw|cn|hant|mo|my|hans|hk|sg))?/|/sr(-(ec|el))?/|/wiki(/|\\?(cur|old)id=)|/w/|/\\?title=)')
AND http_status = '200'
AND uri_path NOT LIKE('undefined')
AND uri_path NOT LIKE('Undefined')
) sub1
WHERE sub1.language_and_site != ''
AND sub1.project_suffix IS NOT NULL
AND (
sub1.project_suffix != '.m'
OR SPLIT(sub1.language_and_site, '\\.')[0] IN (
${hiveconf:whitelisted_mediawiki_projects}
)
)
GROUP BY sub1.language_and_site, sub1.project_suffix
LIMIT 100000000;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment