Skip to content

Instantly share code, notes, and snippets.

// spark2-shell --jars /srv/deployment/analytics/refinery/artifacts/refinery-job.jar
/**
* Use RefineTarget.find to find all Refine targets for an input (camus job) in the last N hours.
* Then filter for any for which the _REFINED_FAILED flag exists.
*/
// Fixed: original had a doubled keyword ("import import") on the Hadoop Path line.
import org.apache.hadoop.fs.Path
import org.joda.time.format.DateTimeFormatter
import com.github.nscala_time.time.Imports._
@nuria
nuria / event-streams.js
Last active November 26, 2019 19:27
Event Streams consumption
// This is the EventStreams RecentChange stream endpoint (server-sent events).
var url = 'https://stream.wikimedia.org/v2/stream/recentchange';
// Use EventSource (available in most browsers, or as an
// npm module: https://www.npmjs.com/package/eventsource)
// to subscribe to the stream.
var recentChangeStream = new EventSource(url);
// Print each event to the console (handler attached below; truncated in this chunk).
recentChangeStream.onmessage = function(message) {
@nuria
nuria / test_segment_metadata.sh
Last active October 2, 2019 21:59
test_segment_metadata.sh
# Ask the local Druid broker for segment metadata on the wmf_netflow
# datasource over September 2019 (pretty-printed JSON response).
curl --request POST 'http://localhost:8082/druid/v2/?pretty' \
  --header 'Content-Type:application/json' \
  --header 'Accept:application/json' \
  --data '{
"queryType":"segmentMetadata",
"dataSource":"wmf_netflow",
"intervals":["2019-09-01/2019-10-01"]
}'
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# From stat1004:
# pyspark2 --jars ~otto/spark-sql-kafka-0-10_2.11-2.3.1.jar,~otto/kafka-clients-1.1.0.jar
# Need spark-sql-kafka for DataStream source and kafka-clients for Kafka serdes.
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Declare a Spark schema that matches the JSONData.
# In a future MEP world this would be automatically loaded
# from a JSONSchema.
@nuria
nuria / select-events-per-day.hql
Created July 22, 2019 20:17
select events per day
-- Daily counts of 'init' events from non-bot user agents in
-- event.externalguidance for June 2019, keyed by a yyyy-MM-dd date string.
SELECT
    CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
    COUNT(1) AS n_events
FROM event.externalguidance
WHERE year = 2019
    AND month = 6
    AND NOT useragent.is_bot
    AND event.action = 'init'
GROUP BY year, month, day
ORDER BY date
LIMIT 1000000
@nuria
nuria / calculate_entropy.py
Created February 12, 2019 04:58
Calculate entropy
#!/usr/bin/python
# Calculate entropy over data read from a file given as the first CLI argument.
# NOTE(review): the rest of the script is truncated in this chunk.
import sys
import math
# Input file path from the command line.
f = sys.argv[1]
# NOTE(review): the handle is never closed in the visible code — confirm the
# truncated remainder closes it (or use a `with` block).
_file = open(f)
# Accumulator dict — presumably value -> count for the entropy computation;
# verify against the truncated remainder of the script.
data = {}
@nuria
nuria / .vimrc
Last active September 17, 2019 21:31
.vimrc
" Editor basics: line numbers, syntax highlighting, highlight the current
" line, and show pending commands in the status area.
set number
syntax enable
set cursorline
set showcmd
" Show invisible characters.
" Fixed: the original line was a bare string ('show invisible chars'), which
" is not a comment in vimscript — comments start with a double quote.
set listchars=tab:→\ ,space:·,nbsp:␣,trail:•,eol:¶,precedes:«,extends:»
set list
-- Make the session dataset smaller to be able to try things fast:
--create table session_tryouts as select * from classifier_data_sorted a where a.sessionId in (select distinct s.sessionId from classifier_data_sorted s limit 100);
-- Rebuild the per-session label table from scratch.
-- NOTE(review): the CREATE TABLE ... AS SELECT below is truncated in this
-- chunk; the rest of the select list and the FROM/GROUP BY are not visible.
drop table if exists classifier_data_label;
create table
classifier_data_label
as
select
sessionId,
-- Session length in seconds: last event timestamp minus first.
(unix_timestamp(max(ts)) - unix_timestamp( min(ts))) as length,
# Run the AppSessionMetrics Spark job over the mobile-apps-sessions data for
# 2015-03-10. Fixed: the jar filename was wrapped across two lines in the
# original paste ("refinery-job-0" / ".0.10-SNAPSHOT.jar"), which made the
# command unrunnable; the path is rejoined onto one continuation-split command.
spark-submit \
    --class org.wikimedia.analytics.refinery.job.AppSessionMetrics \
    --master yarn \
    --num-executors=6 \
    --executor-cores=2 \
    --executor-memory=2g \
    /mnt/hdfs/tmp/nuria/jars/refinery-job-0.0.10-SNAPSHOT.jar \
    hdfs://analytics-hadoop/tmp/mobile-apps-sessions 2015 03 10