Dan Andreescu (milimetric)

  • Wikimedia Foundation
  • New York, NY
#!/usr/local/bin/python
# Calculates per-country weekly percentiles.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.dates as md
import csv
import datetime
from scipy import stats
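The preview cuts off after the imports. As a hint of what the header comment describes, here is a minimal sketch of computing weekly percentiles per country, assuming input rows of (country, week, value); the function and argument names are illustrative, not from the original gist.

import numpy as np

def weekly_percentiles(rows, percentiles=(50, 90, 99)):
    """Group (country, week, value) rows and compute percentiles per group."""
    grouped = {}
    for country, week, value in rows:
        grouped.setdefault((country, week), []).append(value)
    return {
        key: [np.percentile(values, p) for p in percentiles]
        for key, values in grouped.items()
    }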
@milimetric
milimetric / one-time-wikimetrics-upgrade
Last active August 29, 2015 13:58
Updates the old report table results so they work with the new UI.
begin;
/* insert a RunReport for each AggregateReport, easiest way is to delete existing ones first */
delete from report where name like '<RunReport("%';
insert into report (name, show_in_ui, queue_result_key, result_key, created, user_id, status, parameters)
select 'RunReport Will Rename Below' as name,
       0 as show_in_ui,
       a.queue_result_key,
       a.result_key,
select date, project, variant, page_title,
       rank() over (partition by date, project, variant order by tmp.pageviews desc) as rank
  from (select concat(year, lpad(month, 2, '0'), lpad(day, 2, '0')) as date,
               project,
               variant,
               page_title,
               sum(view_count) as pageviews
          from wmf.pageview_hourly
         where year = 2015
           and month = 7
druid.host=localhost
druid.service=broker
druid.port=8000
druid.zk.service.host=localhost
# Change these to make Druid faster
druid.processing.buffer.sizeBytes=100000000
druid.processing.numThreads=1
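For context on what this broker would serve: Druid brokers answer native JSON queries POSTed to /druid/v2 on their configured port. A minimal sketch, assuming a datasource named 'pageviews' (the datasource, metric, and interval below are made up for illustration).

import json
import requests  # third-party HTTP client

# Hypothetical timeseries query against the broker configured above.
query = {
    "queryType": "timeseries",
    "dataSource": "pageviews",  # made-up datasource name
    "granularity": "hour",
    "aggregations": [{"type": "longSum", "name": "views", "fieldName": "view_count"}],
    "intervals": ["2015-07-01/2015-07-02"],
}
resp = requests.post("http://localhost:8000/druid/v2", json=query)  # druid.port=8000
print(json.dumps(resp.json(), indent=2))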
@milimetric
milimetric / Partially Refining sampled logs in Hive
Created October 1, 2015 15:08
If you want to look at old sampled logs with the new refinery approach (UDFs, etc.), use this.
ADD JAR /srv/deployment/analytics/refinery/artifacts/refinery-hive.jar;
CREATE TEMPORARY FUNCTION client_ip as 'org.wikimedia.analytics.refinery.hive.ClientIpUDF';
CREATE TEMPORARY FUNCTION geocoded_data as 'org.wikimedia.analytics.refinery.hive.GeocodedDataUDF';
CREATE TEMPORARY FUNCTION is_pageview as 'org.wikimedia.analytics.refinery.hive.IsPageviewUDF';
CREATE TEMPORARY FUNCTION get_access_method as 'org.wikimedia.analytics.refinery.hive.GetAccessMethodUDF';
CREATE TEMPORARY FUNCTION classify_referer as 'org.wikimedia.analytics.refinery.hive.RefererClassifierUDF';
CREATE TEMPORARY FUNCTION is_wikimedia_bot as 'org.wikimedia.analytics.refinery.hive.IsWikimediaBotUDF';
CREATE TEMPORARY FUNCTION ua_parser as 'org.wikimedia.analytics.refinery.hive.UAParserUDF';
CREATE TEMPORARY FUNCTION is_spider as 'org.wikimedia.analytics.refinery.hive.IsSpiderUDF';
# 20 days in minutes / 10 minutes = 2880 chunks
# 1 day in minutes / 10 minutes = 144 chunks
use log;
set @start = '20151014050000';
select seq.tenMinuteChunk
  from (select left(date_format(date_add(@start, interval seq*10 minute), '%Y%m%d%H%i'), 11) as tenMinuteChunk
          from seq_0_to_144
       ) as seq;
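The same list is easy to sanity-check in Python; this sketch mirrors the query and shows why keeping 11 of the 12 characters of the formatted timestamp yields ten-minute buckets (it is a reconstruction for illustration, not part of the gist).

from datetime import datetime, timedelta

start = datetime.strptime('20151014050000', '%Y%m%d%H%M%S')
# '%Y%m%d%H%M' is 12 characters; keeping 11 drops the final minute digit,
# so 05:00 through 05:09 all collapse to the chunk '20151014050'.
chunks = [
    (start + timedelta(minutes=10 * i)).strftime('%Y%m%d%H%M')[:11]
    for i in range(144)  # 144 ten-minute chunks cover one day
]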
@milimetric
milimetric / distinct-wikis-for-one-hour-of-pageviews
Created November 17, 2015 20:59
To help find the correct project name for the WMF Pageview API, here is a list of all distinct projects seen in one hour of pageviews.
aa.wikibooks
aa.wikipedia
aa.wiktionary
ab.wikipedia
ace.wikipedia
af.wikibooks
af.wikipedia
af.wikiquote
af.wiktionary
ak.wikipedia
import sqlalchemy
from sqlalchemy import create_engine, MetaData, Column, Integer

engine = create_engine('mysql://test:test@localhost/test')
meta = MetaData()
table = sqlalchemy.Table(
    'test_table',
    meta,
    Column('test_col', Integer),
)
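A hedged continuation, assuming the intent was to create and exercise the table; this part is not in the original preview, and the calls are in the pre-2.0 SQLAlchemy style the imports above suggest.

# Create the table in the database, insert a row, and read it back.
meta.create_all(engine)
engine.execute(table.insert().values(test_col=42))
for row in engine.execute(table.select()):
    print(row.test_col)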
@milimetric
milimetric / .gitignore
Last active December 15, 2015 12:49
Which skin do Wikipedia editors use? Only looks at editors with 5 or more edits over the past 30 days.
*swp
def get_udp2log_ports():
    """Returns the listen ports of running udp2log processes"""
    pattern = "/usr/bin/udp2log"
    return [get_p(cmd) for cmd in [get_cmd(pid) for pid in iter_pids()] if has_p(pattern, cmd)]

def has_p(pattern, cmd):
    """True if cmd is an invocation of pattern that was given a -p (port) flag."""
    return pattern in cmd[0] and '-p' in cmd

def get_p(cmd):
    """Returns the port value that followed -p, as an int."""
    return int(cmd[cmd.index('-p') + 1])
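The preview cuts off before iter_pids and get_cmd are defined. A minimal sketch of plausible Linux implementations reading /proc; these two functions are reconstructions, not the original code.

import os

def iter_pids():
    """Yields the numeric PIDs currently listed in /proc."""
    for entry in os.listdir('/proc'):
        if entry.isdigit():
            yield int(entry)

def get_cmd(pid):
    """Returns a process's argv as a list (cmdline is NUL-separated)."""
    try:
        with open('/proc/%d/cmdline' % pid) as f:
            return f.read().split('\0')
    except IOError:  # the process exited between listing and reading
        return ['']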