Skip to content

Instantly share code, notes, and snippets.

// spark2-shell --jars /srv/deployment/analytics/refinery/artifacts/refinery-job.jar
/**
* Use RefineTarget.find to find all Refine targets for an input (camus job) in the last N hours.
* Then filter for any for which the _REFINED_FAILED flag exists.
*/
// Fixed: original had a doubled keyword ("import import") on the Hadoop Path line.
import org.apache.hadoop.fs.Path
import org.joda.time.format.DateTimeFormatter
import com.github.nscala_time.time.Imports._
@nuria
nuria / event-streams.js
Last active November 26, 2019 19:27
Event Streams consumption
// This is the EventStreams RecentChange stream endpoint (server-sent events).
var url = 'https://stream.wikimedia.org/v2/stream/recentchange';
// Use EventSource (available in most browsers, or as an
// npm module: https://www.npmjs.com/package/eventsource)
// to subscribe to the stream.
var recentChangeStream = new EventSource(url);
// Print each event to the console (handler attached below; truncated in this chunk).
recentChangeStream.onmessage = function(message) {
@nuria
nuria / test_segment_metadata.sh
Last active October 2, 2019 21:59
test_segment_metadata.sh
# Ask the local Druid broker for segment metadata on the wmf_netflow
# datasource over September 2019 (pretty-printed JSON response).
curl --request POST 'http://localhost:8082/druid/v2/?pretty' \
  --header 'Content-Type:application/json' \
  --header 'Accept:application/json' \
  --data '{
"queryType":"segmentMetadata",
"dataSource":"wmf_netflow",
"intervals":["2019-09-01/2019-10-01"]
}'
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# From stat1004:
# pyspark2 --jars ~otto/spark-sql-kafka-0-10_2.11-2.3.1.jar,~otto/kafka-clients-1.1.0.jar
# Need spark-sql-kafka for DataStream source and kafka-clients for Kafka serdes.
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Declare a Spark schema that matches the JSONData.
# In a future MEP world this would be automatically loaded
# from a JSONSchema.
@nuria
nuria / select-events-per-day.hql
Created July 22, 2019 20:17
select events per day
-- Daily counts of 'init' events from non-bot user agents in
-- event.externalguidance for June 2019, keyed by a yyyy-MM-dd date string.
SELECT
    CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
    COUNT(1) AS n_events
FROM event.externalguidance
WHERE year = 2019
    AND month = 6
    AND NOT useragent.is_bot
    AND event.action = 'init'
GROUP BY year, month, day
ORDER BY date
LIMIT 1000000
@nuria
nuria / calculate_entropy.py
Created February 12, 2019 04:58
Calculate entropy
#!/usr/bin/python
# Calculate entropy over data read from a file given as the first CLI argument.
# NOTE(review): the rest of the script is truncated in this chunk.
import sys
import math
# Input file path from the command line.
f = sys.argv[1]
# NOTE(review): the handle is never closed in the visible code — confirm the
# truncated remainder closes it (or use a `with` block).
_file = open(f)
# Accumulator dict — presumably value -> count for the entropy computation;
# verify against the truncated remainder of the script.
data = {}
@nuria
nuria / .vimrc
Last active September 17, 2019 21:31
.vimrc
" Editor basics: line numbers, syntax highlighting, highlight the current
" line, and show pending commands in the status area.
set number
syntax enable
set cursorline
set showcmd
" Show invisible characters.
" Fixed: the original line was a bare string ('show invisible chars'), which
" is not a comment in vimscript — comments start with a double quote.
set listchars=tab:→\ ,space:·,nbsp:␣,trail:•,eol:¶,precedes:«,extends:»
set list
-- Make the session dataset smaller to be able to try things fast:
--create table session_tryouts as select * from classifier_data_sorted a where a.sessionId in (select distinct s.sessionId from classifier_data_sorted s limit 100);
-- Rebuild the per-session label table from scratch.
-- NOTE(review): the CREATE TABLE ... AS SELECT below is truncated in this
-- chunk; the rest of the select list and the FROM/GROUP BY are not visible.
drop table if exists classifier_data_label;
create table
classifier_data_label
as
select
sessionId,
-- Session length in seconds: last event timestamp minus first.
(unix_timestamp(max(ts)) - unix_timestamp( min(ts))) as length,
# Run the AppSessionMetrics Spark job over the mobile-apps-sessions data for
# 2015-03-10. Fixed: the jar filename was wrapped across two lines in the
# original paste ("refinery-job-0" / ".0.10-SNAPSHOT.jar"), which made the
# command unrunnable; the path is rejoined onto one continuation-split command.
spark-submit \
    --class org.wikimedia.analytics.refinery.job.AppSessionMetrics \
    --master yarn \
    --num-executors=6 \
    --executor-cores=2 \
    --executor-memory=2g \
    /mnt/hdfs/tmp/nuria/jars/refinery-job-0.0.10-SNAPSHOT.jar \
    hdfs://analytics-hadoop/tmp/mobile-apps-sessions 2015 03 10