#!/usr/local/bin/python
# calculates per country weekly percentiles
#
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.dates as md
import csv
import datetime
from scipy import stats
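The preview above stops at the imports; what follows is a minimal sketch of the weekly per-country percentile computation it describes, assuming an input CSV of country,date,value rows (the column layout and function name are assumptions, not from the gist).
from collections import defaultdict

def weekly_percentiles(path, percentiles=(50, 90, 99)):
    # bucket values by (country, ISO week), then let numpy do the percentile math
    buckets = defaultdict(list)
    with open(path) as f:
        for country, date_str, value in csv.reader(f):
            day = datetime.datetime.strptime(date_str, '%Y-%m-%d')
            buckets[(country, day.isocalendar()[1])].append(float(value))
    return {key: [np.percentile(vals, p) for p in percentiles]
            for key, vals in buckets.items()}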
#!/usr/local/bin/python
# Reads a file like <count>, raw browser string
# and processes it to output:
# percentage, normalized browser string
import sys
import md5
from ua_parser import user_agent_parser
# beautify ua
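The rest of this gist is not shown; a minimal sketch of the described processing, assuming "<count> <raw user agent>" lines on stdin (the normalized output format is an assumption).
def normalize(raw_ua):
    # collapse a raw user agent string to "OS family / browser family major"
    parsed = user_agent_parser.Parse(raw_ua)
    return '%s / %s %s' % (parsed['os']['family'],
                           parsed['user_agent']['family'],
                           parsed['user_agent']['major'] or '')

counts, total = {}, 0
for line in sys.stdin:
    count, _, raw_ua = line.rstrip('\n').partition(' ')
    key = normalize(raw_ua)
    counts[key] = counts.get(key, 0) + int(count)
    total += int(count)

for ua, n in sorted(counts.items(), key=lambda kv: -kv[1]):
    print('%.2f%%\t%s' % (100.0 * n / total, ua))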
nuria / UaCalculator
Last active August 29, 2015 14:14
Calculates user agent percentages
#!/usr/bin/python
# read file
# File format is:
# {"browser_major":"1","os_family":"Android","os_major":"1","device_family":"Opus One","browser_family":"Android","os_minor":"5"} 5
# {"browser_major":"1","os_family":"Android","os_major":"4","device_family":"icube 900","browser_family":"Baidu Explorer","os_minor":"2"} 1
# hash and store values
# loop over values and
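The preview cuts off mid-comment; a minimal sketch of the steps it outlines (parse each "<json> <count>" line, key on the parsed agent, emit percentages), where the grouping key is an assumption.
import json
import sys

totals, grand_total = {}, 0
for line in open(sys.argv[1]):
    blob, _, count = line.rstrip('\n').rpartition(' ')
    ua = json.loads(blob)
    key = (ua.get('os_family'), ua.get('browser_family'), ua.get('browser_major'))
    totals[key] = totals.get(key, 0) + int(count)
    grand_total += int(count)

for key, n in sorted(totals.items(), key=lambda kv: -kv[1]):
    print('%.2f%%\t%s' % (100.0 * n / grand_total, key))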
nuria / cat_file_to_tcp_endpoint
Last active August 29, 2015 14:15
Cats a file to a tcp endpoint line by line using streams and zeromq
#!/usr/local/bin/python
import zmq
import io
import time
import sys
import re
# reads line by line a file and sends it
# to a tcp endpoint using zeromq
# handy to cat big files to a listener
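The sending loop itself is not in the preview; here is a minimal sketch, where the PUSH socket type, the argument layout, and the throttle are assumptions rather than details from the gist.
context = zmq.Context()
socket = context.socket(zmq.PUSH)
socket.connect(sys.argv[2])  # e.g. tcp://127.0.0.1:5555

with io.open(sys.argv[1], 'r', encoding='utf-8', errors='replace') as f:
    for line in f:
        socket.send_string(line.rstrip('\n'))
        time.sleep(0.001)  # light throttle so the listener can keep up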
nuria / gist:2f5e33902122870e44e0
Created March 25, 2015 17:36
/var/lib/git/operations/puppet/manifests/misc/limn1.pp
node 'limn1.eqiad.wmflabs' {
    include webserver::apache

    # make sure /var/log/apache2 is readable by wikidevs for debugging.
    # This won't make the actual log files readable, only the directory.
    # Individual log files can be created and made readable by
    # classes that manage individual sites.
    file { '/var/log/apache2':
        ensure => 'directory',
        owner  => 'root',
nuria / gist:e0b0d4a702cdd45f6d37
Last active August 29, 2015 14:17
VCL cookie setting, time manipulation
C{
    #include <stdio.h>
    #include <time.h>
    #include <string.h>
    #define vcl_string char

    char* get_expiration() {
        struct tm str_time;
        time_t time_of_day;
        char expiration[100];
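For reference, the same idea in Python; it assumes the truncated get_expiration() goes on to format a cookie Expires date some interval in the future, which is not confirmed by the snippet.
import time

def get_expiration(days=30):
    # cookie-style date, e.g. "Wed, 09 Jun 2021 10:18:14 GMT"; the 30-day
    # offset is an assumption, not taken from the VCL snippet
    expires = time.gmtime(time.time() + days * 86400)
    return time.strftime('%a, %d %b %Y %H:%M:%S GMT', expires)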
nuria / testing_udf.hql
Last active September 16, 2015 18:29
Testing a udf
add jar /home/nuria/workplace/refinery/source/refinery-core/target/refinery-core-0.0.19-SNAPSHOT.jar;
add jar /home/nuria/workplace/refinery/source/refinery-hive/target/refinery-hive-0.0.19-SNAPSHOT.jar;
CREATE TEMPORARY FUNCTION isPageview as 'org.wikimedia.analytics.refinery.hive.IsPageviewUDF';
CREATE TEMPORARY FUNCTION isAppPageview as 'org.wikimedia.analytics.refinery.hive.IsAppPageviewUDF';
use wmf;
select isPageview(uri_host, uri_path, uri_query, http_status, content_type, user_agent) from webrequest where year=2015 and month=09 and day=04 and hour=01;
-- call hive like this, leaving hive.aux.jars.path empty:
>hive --hiveconf hive.aux.jars.path= -f test-udf.hql
-- geocoded data on webrequest is like:
-- {"city":"Unknown","country_code":"--","longitude":"-1","postal_code":"Unknown","timezone":"Unknown","subdivision":"Unknown","continent":"Unknown","latitude":"-1","country":"Unknown"}
-- find records where, by city, we have fewer than 10 unique IPs
use wmf;
select wr1.client_ip, geocoded_data["city"] from webrequest as wr1 where year=2015 and month=09 and hour=01
and wr1.client_ip in (select wr2.client_ip from webrequest wr2 where year=2015 and month=09 and hour=01 group by wr2.client_ip having count(*) <10);
nuria / gist:833fef6a74574125a3fc
Last active September 30, 2015 20:59
Add a third party lib to map reduce job
#!/bin/sh
export LIBJARS=/home/nuria/avro-kafka/camus-example-0.1.0-wmf6.jar,/home/nuria/avro-kafka/camus-wmf-0.1.0-wmf6.jar
export HADOOP_CLASSPATH=/home/nuria/avro-kafka/camus-example-0.1.0-wmf6.jar:/home/nuria/avro-kafka/camus-wmf-0.1.0-wmf6.jar
/usr/bin/hadoop jar /home/nuria/avro-kafka/camus-wmf-0.1.0-wmf6.jar com.linkedin.camus.etl.kafka.CamusJob -libjars ${LIBJARS} -Dcamus.job.name="nuria_testing_avro" -P /home/nuria/avro-kafka/camus.avrotest.properties >> ./log_camus_avro_test.txt 2>&1
nuria / gist:01fef56a8a69528fee93
Created October 26, 2015 22:47
consume from kafka
from kafka import KafkaConsumer
import avro.schema
import avro.io
import io
# To consume messages
consumer = KafkaConsumer('mediawiki_CirrusSearchRequestSet',
                         group_id='my_group',
                         metadata_broker_list=['kafka1012:9092'])
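The preview ends once the consumer is created; a minimal sketch of the consume-and-decode loop follows, where the schema file name is an assumption.
schema = avro.schema.parse(open('CirrusSearchRequestSet.avsc').read())
reader = avro.io.DatumReader(schema)

for message in consumer:
    # each Kafka message value is assumed to be an Avro-encoded record
    decoder = avro.io.BinaryDecoder(io.BytesIO(message.value))
    print(reader.read(decoder))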