Skip to content

Instantly share code, notes, and snippets.

@milimetric
Created February 22, 2019 16:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save milimetric/7e8614d21550c0c23d86945ff463817b to your computer and use it in GitHub Desktop.
Save milimetric/7e8614d21550c0c23d86945ff463817b to your computer and use it in GitHub Desktop.
example query to mess around with
use wmf;
-- new data
select coalesce(c.country, g.country_code) as country,
sum(edit_count) as edits,
sum(namespace_zero_edit_count) as namespace_zero_edits
from geoeditors_edits_monthly g
inner join
(select distinct dbname
from wmf_raw.mediawiki_project_namespace_map
where hostname like '%.wikipedia.%'
and snapshot='2019-01'
) w on w.dbname = g.wiki_db
left join
(select distinct country, country_code
from pageview_hourly
where year=2019 and month=2 and day=10 and hour = 15
) c on g.country_code = c.country_code
where month in ('2018-11', '2018-12')
group by g.country_code, c.country
--having sum(edit_count) >= 16000
order by country
limit 100000
;
-- new data lower bound based on editors
select country,
sum(if(activity_level = '1 to 4', distinct_editors, 0)) +
(5 * sum(if(activity_level = '5 to 99', distinct_editors, 0))) +
(100 * sum(if(activity_level = '100 or more', distinct_editors, 0))) as lower_bound
from geoeditors_monthly g
left join
(select distinct country, country_code
from pageview_hourly
where year=2019 and month=2 and day=10 and hour = 15
) c on g.country_code = c.country_code
where month in ('2018-11', '2018-12')
group by country
having sum(distinct_editors) >= 1000
order by country
limit 1000
;
-- playing around with an example of 'AL' and different constants to see the estimate
select sum(edits) from geoeditors_edits_monthly where country_code = 'AL' and month='2018-12';
select (3 * sum(if(activity_level = '1 to 4', distinct_editors, 0))) +
(10 * sum(if(activity_level = '5 to 99', distinct_editors, 0))) +
(110 * sum(if(activity_level = '100 or more', distinct_editors, 0))) as lower_bound
from geoeditors_monthly g
where month in ('2018-12')
and country_code = 'AL'
;
-- raw data deleted after 90 days is:
in geoeditors_daily
-- old data
select country,
sum(edits) edits
from geowiki_archive_edits_country
where start like '2017-%-01'
group by country
having sum(edits) >= 40000
order by country
limit 100000
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment