This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Runs the Camus ETL job (com.linkedin.camus.etl.kafka.CamusJob) through
# `hadoop jar`, configured by camus.avrotest.properties, and appends the
# job's stdout and stderr to ./log_camus_avro_test.txt.

camus_dir=/home/nuria/avro-kafka
example_jar="${camus_dir}/camus-example-0.1.0-wmf6.jar"
wmf_jar="${camus_dir}/camus-wmf-0.1.0-wmf6.jar"

# The same two jars are listed twice on purpose: comma-separated for the
# -libjars option, colon-separated for HADOOP_CLASSPATH.
export LIBJARS="${example_jar},${wmf_jar}"
export HADOOP_CLASSPATH="${example_jar}:${wmf_jar}"

# The job name is kept on a single line so it no longer contains an
# embedded newline.
/usr/bin/hadoop jar "${wmf_jar}" com.linkedin.camus.etl.kafka.CamusJob \
  -libjars "${LIBJARS}" \
  -Dcamus.job.name="nuria_testing_avro" \
  -P "${camus_dir}/camus.avrotest.properties" \
  >> ./log_camus_avro_test.txt 2>&1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from kafka import KafkaConsumer | |
import avro.schema | |
import avro.io | |
import io | |
# Create a consumer for the 'mediawiki_CirrusSearchRequestSet' topic,
# joining consumer group 'my_group' and using kafka1012:9092 as the broker.
# NOTE(review): newer kafka-python releases take `bootstrap_servers` instead
# of `metadata_broker_list` -- confirm against the installed version before
# renaming the keyword.
consumer = KafkaConsumer(
    'mediawiki_CirrusSearchRequestSet',
    group_id='my_group',
    metadata_broker_list=['kafka1012:9092'],
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# good 5 mins tutorial | |
http://www.elasticsearchtutorial.com/basic-elasticsearch-concepts.html | |
# deeper tutorial | |
http://exploringelasticsearch.com/ | |
# load a schema (I think this creates an index) | |
curl -XPOST 'http://localhost:9200/test/pageview/_mapping' --data "@/home/mforns/loading/schema.json" | |
# add a record |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// spark-shell --num-executors 16 --executor-cores 1 --executor-memory 256M --master yarn \ | |
// --jars /mnt/hdfs/wmf/refinery/2016-02-23T18.55.34Z--7dadb6b/artifacts/org/wikimedia/analytics/refinery/refinery-hive-0.0.26.jar \ | |
// --conf 'spark.executor.extraJavaOptions=-Dfile.encoding=UTF-8' | |
import sys.process._ | |
import org.wikimedia.analytics.refinery.core.PageviewDefinition; | |
def pt(path: String): String = { | |
val pageviewDefinition = PageviewDefinition.getInstance() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Parses a JSON object.
 * The object will be traversed, and each leaf node of the object will
 * be keyed by a concatenated key made up of all parent keys.
 *
 * NOTE(review): the constructor below has an empty body and does not use
 * `reporter`; the traversal described above is presumably implemented
 * elsewhere (e.g. on the prototype) -- confirm.
 *
 * @param reporter accepted but unused in the visible code.
 */
function MetricLogster(reporter) {
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SELECT | |
month, | |
day, | |
SUM(CASE WHEN (user_agent LIKE '%iPhone%') THEN 1 ELSE 0 END) AS iphone, | |
SUM(CASE WHEN (user_agent LIKE '%iOS%') THEN 1 ELSE 0 END) AS iOS | |
FROM wmf.webrequest | |
WHERE webrequest_source = 'text' | |
AND year = 2016 | |
AND month IN (9, 10) | |
AND (user_agent like '%iOS%' OR user_agent like '%iPhone%') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
# unique devices variation study using daily data | |
# we account for weekly variations | |
# and try to see when the number of uniques | |
# varies too much to be a quality measurement | |
# see: https://wikitech.wikimedia.org/w/index.php?title=Analytics/Data_Lake/Traffic/Unique_Devices/Last_access_solution | |
from operator import itemgetter | |
from datetime import datetime | |
from datetime import date | |
import numpy |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.wikimedia.analytics.refinery.tag; | |
import com.google.common.collect.ImmutableSet; | |
import com.google.common.reflect.ClassPath; | |
import java.util.ArrayList; | |
import java.util.List; | |
import java.util.Set; | |
import org.reflections.*; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"type" : "record", | |
"name" : "AutoGeneratedSchema", | |
"doc" : "Sqoop import of QueryResult", | |
"fields" : [ { | |
"name" : "id", | |
"type" : [ "null", "int" ], | |
"default" : null, | |
"columnName" : "id", | |
"sqlType" : "4" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use wmf; | |
with hits as ( | |
SELECT | |
geocoded_data['country_code'] as country, | |
geocoded_data['country'] country_name, | |
SUM(CASE WHEN hostname NOT LIKE 'cp3%' AND hostname NOT LIKE 'amssq%' THEN 1 ELSE 0 END) AS hits_from_this_country_not_through_amsterdam, | |
SUM(CASE WHEN hostname LIKE 'cp3%' OR hostname LIKE 'amssq%' THEN 1 ELSE 0 END) AS hits_from_this_country_from_amsterdam | |
FROM wmf.webrequest | |
WHERE TRUE |