
Dan Andreescu (milimetric)

  • Wikimedia Foundation
  • New York, NY
milimetric / differ.py
Last active December 25, 2015 00:28
Python to diff two lists of datetime strings, each with its own format
from datetime import datetime

def diff_datewise(left, right, left_format=None, right_format=None):
    """
    Parameters
        left         : a list of datetime strings or objects
        right        : a list of datetime strings or objects
        left_format  : None if left contains datetimes, or strptime format
        right_format : None if right contains datetimes, or strptime format
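    Returns
        (left_only, right_only) as sets
    """
    # hypothetical reconstruction -- the gist preview truncates above
    def parse(values, fmt):
        return set(values) if fmt is None else {datetime.strptime(v, fmt) for v in values}
    left_set, right_set = parse(left, left_format), parse(right, right_format)
    return left_set - right_set, right_set - left_set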
milimetric / categoryEdgeList.sql
Last active December 25, 2015 07:09
Category Edge List for MediaWiki content
select p1.page_title as child_title,
       p1.page_id as child_id,
       p2.page_title as parent_title,
       p2.page_id as parent_id
  from categorylinks cl
       inner join page p1
           on p1.page_id = cl.cl_from
       inner join page p2
           on p2.page_title = cl.cl_to
          and p2.page_id <> cl.cl_from
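-- (aside, not in the original: cl_to holds a Category-namespace title, so a
--  p2.page_namespace = 14 filter is usually wanted as well, to avoid matching
--  same-titled pages outside the Category namespace)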
milimetric / redis.conf
Created December 11, 2013 17:01
Wikimetrics configuration for Redis
daemonize yes
pidfile /var/run/redis.pid
port 6379
timeout 0
loglevel debug
logfile /var/log/redis/redis-server.log
databases 16
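# snapshot to disk when at least <changes> keys changed within <seconds>
# ("save 900 1" = save after 900s if 1 or more keys changed, and so on):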
save 900 1
save 300 10
save 60 20
#!/usr/local/bin/python
# calculates per-country weekly percentiles
#
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.dates as md
import csv
import datetime
from scipy import stats
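
# preview ends at the imports; below is a minimal sketch of the idea,
# assuming (country, date, value) rows -- the real column layout is unknown
def weekly_percentiles(rows, q=(50, 90, 99)):
    buckets = {}
    for country, date_str, value in rows:
        week = datetime.datetime.strptime(date_str, '%Y-%m-%d').isocalendar()[1]
        buckets.setdefault((country, week), []).append(float(value))
    return {k: np.percentile(v, q) for k, v in buckets.items()}
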
milimetric / one-time-wikimetrics-upgrade
Last active August 29, 2015 13:58
updates the old report table results to make them work with the new UI
begin;
/* insert a RunReport for each AggregateReport, easiest way is to delete existing ones first */
delete from report where name like '<RunReport("%';
insert into report (name, show_in_ui, queue_result_key, result_key, created, user_id, status, parameters)
select 'RunReport Will Rename Below' as name,
       0 as show_in_ui,
       a.queue_result_key,
       a.result_key,
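       -- preview truncated; plausibly the remaining columns come straight
       -- off the old AggregateReport rows (hypothetical reconstruction):
       a.created,
       a.user_id,
       a.status,
       a.parameters
  from report a
 where a.name like '<AggregateReport("%';
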
# 20 days in minutes / 10 minutes = 2880 chunks
# 1 day in minutes / 10 minutes = 144 chunks
use log;
set @start = '20151014050000';
select seq.tenMinuteChunk
from (select left(date_format(date_add(@start, interval seq*10 minute), '%Y%m%d%H%i'), 11) as tenMinuteChunk
from seq_0_to_144
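     -- preview truncated; the derived table only needs closing:
     ) seq;
-- note: left(..., 11) keeps 'YYYYMMDDHHm', i.e. one label per ten-minute chunk
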
druid.host=localhost
druid.service=broker
druid.port=8000
druid.zk.service.host=localhost
# Change these to make Druid faster
druid.processing.buffer.sizeBytes=100000000
druid.processing.numThreads=1
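# (sizeBytes above allocates roughly 100 MB per processing buffer; numThreads
# is typically set to available cores minus one on a dedicated node)
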
select date, project, variant, page_title,
       rank() over (partition by date, project, variant order by tmp.pageviews desc) as rank
  from (select concat(year, lpad(month, 2, '0'), lpad(day, 2, '0')) as date,
               project,
               variant,
               page_title,
               sum(view_count) as pageviews
          from wmf.pageview_hourly
         where year = 2015
           and month = 7
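           -- preview truncated; a plausible completion: the outer query's
           -- tmp alias and a matching group by are required
         group by concat(year, lpad(month, 2, '0'), lpad(day, 2, '0')),
                  project, variant, page_title
       ) tmp;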
milimetric / Partially Refining sampled logs in Hive
Created October 1, 2015 15:08
If you want to look at old sampled logs with the new refinery approach (UDFs, etc.), use this
ADD JAR /srv/deployment/analytics/refinery/artifacts/refinery-hive.jar;
CREATE TEMPORARY FUNCTION client_ip as 'org.wikimedia.analytics.refinery.hive.ClientIpUDF';
CREATE TEMPORARY FUNCTION geocoded_data as 'org.wikimedia.analytics.refinery.hive.GeocodedDataUDF';
CREATE TEMPORARY FUNCTION is_pageview as 'org.wikimedia.analytics.refinery.hive.IsPageviewUDF';
CREATE TEMPORARY FUNCTION get_access_method as 'org.wikimedia.analytics.refinery.hive.GetAccessMethodUDF';
CREATE TEMPORARY FUNCTION classify_referer as 'org.wikimedia.analytics.refinery.hive.RefererClassifierUDF';
CREATE TEMPORARY FUNCTION is_wikimedia_bot as 'org.wikimedia.analytics.refinery.hive.IsWikimediaBotUDF';
CREATE TEMPORARY FUNCTION ua_parser as 'org.wikimedia.analytics.refinery.hive.UAParserUDF';
CREATE TEMPORARY FUNCTION is_spider as 'org.wikimedia.analytics.refinery.hive.IsSpiderUDF';
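
With the functions registered, a query along these lines puts them to work. This is a sketch only: the table name and every UDF argument list below are assumptions, not taken from the gist.

-- hypothetical usage; table name and argument lists are assumptions
select get_access_method(uri_host, user_agent) as access_method,
       count(*) as requests
  from sampled_logs
 where is_pageview(uri_host, uri_path, uri_query, http_status, content_type, user_agent)
 group by get_access_method(uri_host, user_agent);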