#!/usr/local/bin/python
# calculates per country weekly percentiles
#
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.dates as md
import csv
import datetime
from scipy import stats
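The preview above stops at the imports; what follows is a minimal sketch of the weekly per-country percentile computation it describes, assuming an input CSV of country,date,value rows (the column layout and function name are assumptions, not from the gist).
from collections import defaultdict

def weekly_percentiles(path, percentiles=(50, 90, 99)):
    # bucket values by (country, ISO week), then let numpy do the percentile math
    buckets = defaultdict(list)
    with open(path) as f:
        for country, date_str, value in csv.reader(f):
            day = datetime.datetime.strptime(date_str, '%Y-%m-%d')
            buckets[(country, day.isocalendar()[1])].append(float(value))
    return {key: [np.percentile(vals, p) for p in percentiles]
            for key, vals in buckets.items()}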
#!/usr/local/bin/python
# Reads a file like <count>, raw browser string
# and processes it to output:
# percentage, normalized browser string
import sys
import md5
from ua_parser import user_agent_parser
# beautify ua
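The rest of this gist is not shown; a minimal sketch of the described processing, assuming "<count> <raw user agent>" lines on stdin (the normalized output format is an assumption).
def normalize(raw_ua):
    # collapse a raw user agent string to "OS family / browser family major"
    parsed = user_agent_parser.Parse(raw_ua)
    return '%s / %s %s' % (parsed['os']['family'],
                           parsed['user_agent']['family'],
                           parsed['user_agent']['major'] or '')

counts, total = {}, 0
for line in sys.stdin:
    count, _, raw_ua = line.rstrip('\n').partition(' ')
    key = normalize(raw_ua)
    counts[key] = counts.get(key, 0) + int(count)
    total += int(count)

for ua, n in sorted(counts.items(), key=lambda kv: -kv[1]):
    print('%.2f%%\t%s' % (100.0 * n / total, ua))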
nuria / UaCalculator
Last active August 29, 2015 14:14
Calculates user agent percentages
#!/usr/bin/python
# read file
# File format is:
# {"browser_major":"1","os_family":"Android","os_major":"1","device_family":"Opus One","browser_family":"Android","os_minor":"5"} 5
# {"browser_major":"1","os_family":"Android","os_major":"4","device_family":"icube 900","browser_family":"Baidu Explorer","os_minor":"2"} 1
# hash and store values
# loop over values and
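The preview cuts off mid-comment; a minimal sketch of the steps it outlines (parse each "<json> <count>" line, key on the parsed agent, emit percentages), where the grouping key is an assumption.
import json
import sys

totals, grand_total = {}, 0
for line in open(sys.argv[1]):
    blob, _, count = line.rstrip('\n').rpartition(' ')
    ua = json.loads(blob)
    key = (ua.get('os_family'), ua.get('browser_family'), ua.get('browser_major'))
    totals[key] = totals.get(key, 0) + int(count)
    grand_total += int(count)

for key, n in sorted(totals.items(), key=lambda kv: -kv[1]):
    print('%.2f%%\t%s' % (100.0 * n / grand_total, key))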
nuria / cat_file_to_tcp_endpoint
Last active August 29, 2015 14:15
Cats a file to a tcp endpoint line by line using streams and zeromq
#!/usr/local/bin/python
import zmq
import io
import time
import sys
import re
# reads line by line a file and sends it
# to a tcp endpoint using zeromq
# handy to cat big files to a listener
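The sending loop itself is not in the preview; here is a minimal sketch, where the PUSH socket type, the argument layout, and the throttle are assumptions rather than details from the gist.
context = zmq.Context()
socket = context.socket(zmq.PUSH)
socket.connect(sys.argv[2])  # e.g. tcp://127.0.0.1:5555

with io.open(sys.argv[1], 'r', encoding='utf-8', errors='replace') as f:
    for line in f:
        socket.send_string(line.rstrip('\n'))
        time.sleep(0.001)  # light throttle so the listener can keep up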
nuria / gist:2f5e33902122870e44e0
Created March 25, 2015 17:36
/var/lib/git/operations/puppet/manifests/misc/limn1.pp
node 'limn1.eqiad.wmflabs' {
    include webserver::apache

    # make sure /var/log/apache2 is readable by wikidevs for debugging.
    # This won't make the actual log files readable, only the directory.
    # Individual log files can be created and made readable by
    # classes that manage individual sites.
    file { '/var/log/apache2':
        ensure => 'directory',
        owner  => 'root',
nuria / gist:e0b0d4a702cdd45f6d37
Last active August 29, 2015 14:17
VCL cookie setting, time manipulation
C{
    #include <stdio.h>
    #include <time.h>
    #include <string.h>
    #define vcl_string char

    char* get_expiration() {
        struct tm str_time;
        time_t time_of_day;
        char expiration[100];
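For reference, the same idea in Python; it assumes the truncated get_expiration() goes on to format a cookie Expires date some interval in the future, which is not confirmed by the snippet.
import time

def get_expiration(days=30):
    # cookie-style date, e.g. "Wed, 09 Jun 2021 10:18:14 GMT"; the 30-day
    # offset is an assumption, not taken from the VCL snippet
    expires = time.gmtime(time.time() + days * 86400)
    return time.strftime('%a, %d %b %Y %H:%M:%S GMT', expires)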
nuria / testing_udf.hql
Last active September 16, 2015 18:29
Testing a udf
add jar /home/nuria/workplace/refinery/source/refinery-core/target/refinery-core-0.0.19-SNAPSHOT.jar;
add jar /home/nuria/workplace/refinery/source/refinery-hive/target/refinery-hive-0.0.19-SNAPSHOT.jar;
CREATE TEMPORARY FUNCTION isPageview as 'org.wikimedia.analytics.refinery.hive.IsPageviewUDF';
CREATE TEMPORARY FUNCTION isAppPageview as 'org.wikimedia.analytics.refinery.hive.IsAppPageviewUDF';
use wmf;
select isPageview(uri_host, uri_path, uri_query, http_status, content_type, user_agent) from webrequest where year=2015 and month=09 and day=04 and hour=01;
-- call hive like this, leaving hive.aux.jars.path empty:
>hive --hiveconf hive.aux.jars.path= -f test-udf.hql
-- geocoded data on webrequest is like:
-- {"city":"Unknown","country_code":"--","longitude":"-1","postal_code":"Unknown","timezone":"Unknown","subdivision":"Unknown","continent":"Unknown","latitude":"-1","country":"Unknown"}
-- find records where, by city, we have fewer than 10 unique IPs
use wmf;
select wr1.client_ip, geocoded_data["city"] from webrequest as wr1 where year=2015 and month=09 and hour=01
and wr1.client_ip in (select wr2.client_ip from webrequest wr2 where year=2015 and month=09 and hour=01 group by wr2.client_ip having count(*) <10);
nuria / gist:833fef6a74574125a3fc
Last active September 30, 2015 20:59
Add a third party lib to map reduce job
#!/bin/sh
export LIBJARS=/home/nuria/avro-kafka/camus-example-0.1.0-wmf6.jar,/home/nuria/avro-kafka/camus-wmf-0.1.0-wmf6.jar
export HADOOP_CLASSPATH=/home/nuria/avro-kafka/camus-example-0.1.0-wmf6.jar:/home/nuria/avro-kafka/camus-wmf-0.1.0-wmf6.jar
/usr/bin/hadoop jar /home/nuria/avro-kafka/camus-wmf-0.1.0-wmf6.jar com.linkedin.camus.etl.kafka.CamusJob -libjars ${LIBJARS} -Dcamus.job.name="nuria_testing_avro" -P /home/nuria/avro-kafka/camus.avrotest.properties >> ./log_camus_avro_test.txt 2>&1
nuria / gist:01fef56a8a69528fee93
Created October 26, 2015 22:47
consume from kafka
from kafka import KafkaConsumer
import avro.schema
import avro.io
import io
# To consume messages
consumer = KafkaConsumer('mediawiki_CirrusSearchRequestSet',
                         group_id='my_group',
                         metadata_broker_list=['kafka1012:9092'])
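The preview ends once the consumer is created; a minimal sketch of the consume-and-decode loop follows, where the schema file name is an assumption.
schema = avro.schema.parse(open('CirrusSearchRequestSet.avsc').read())
reader = avro.io.DatumReader(schema)

for message in consumer:
    # each Kafka message value is assumed to be an Avro-encoded record
    decoder = avro.io.BinaryDecoder(io.BytesIO(message.value))
    print(reader.read(decoder))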