Skip to content

Instantly share code, notes, and snippets.

@milimetric
Created September 20, 2016 21:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save milimetric/e77e22a736cef4c973a26667a3e94d8c to your computer and use it in GitHub Desktop.
Save milimetric/e77e22a736cef4c973a26667a3e94d8c to your computer and use it in GitHub Desktop.
This is a step-by-step query / command guide to getting pageviews for a particular wiki project. We will productionize this and make it part of the pageview API.
-- Amir's query to get all pages in WikiProject_Medicine, executed on analytics-store
select page_title
from (select tl_from, rd_from
from templatelinks
left join
redirect on rd_from = tl_from
and rd_title = 'WikiProject_Medicine'
and (rd_interwiki is null or rd_interwiki = '')
and rd_namespace = 10
inner join
page on tl_from = page_id
where tl_namespace = 10
and tl_title = 'WikiProject_Medicine'
order by tl_from
) temp_backlink_range
inner join
page on tl_from = page_id
where page_namespace = 1
order by page_title
;
-- output to a file on stat1003 by:
/*
mysql --defaults-file=/etc/mysql/conf.d/research-client.cnf -hanalytics-store.eqiad.wmnet -Denwiki -s < t.sql > wiki-project-medicine-pages.tsv
*/
-- scp-ed to stat1004
-- put in hadoop with:
-- hdfs dfs -put wiki-project-medicine-pages.tsv /user/milimetric/wiki-project-medicine-pages/
-- create a hive table:
/*
create external table milimetric.wikiproject_medicine_page
(page_title string)
location 'hdfs://analytics-hadoop/user/milimetric/wiki-project-medicine-pages'
;
*/
-- and create another hive table for the intermediary counts per page:
/*
create external table milimetric.wikiproject_medicine_page_counts
(
page_title string,
view_count bigint,
year int,
month int
)
location 'hdfs://analytics-hadoop/user/milimetric/wiki-project-medicine-page-counts'
;
*/
-- select total and top pageviews for these articles
insert into milimetric.wikiproject_medicine_page_counts
select p.page_title,
sum(coalesce(w.view_count, 0)) as view_count,
w.year,
w.month
from milimetric.wikiproject_medicine_page p
inner join
wmf.pageview_hourly w on w.page_title = p.page_title
where w.year=2016
and w.month in (7, 8)
group by p.page_title, year, month
;
-- example ways to get data out:
-- top 100 pages in August:
select page_title,
view_count
from milimetric.wikiproject_medicine_page_counts
where year=2016
and month=8
order by view_count desc
limit 100
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment