Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@nuria
nuria / kafkacat-consume-headers.sh
Created May 3, 2021 05:04
kafkacat-consume-headers.sh
# Consume (-C) from topic dlq-lcc-dy0v1 (-t), partition 0 (-p), with client settings
# read from ./kafkacat.auth (-F). Each message is printed with the -f template below:
# key and value with their byte lengths, then timestamp, partition, offset and headers.
kafkacat -F ./kafkacat.auth -p 0 -t dlq-lcc-dy0v1 -C -f '\nKey (%K bytes): %k
Value (%S bytes): %s
Timestamp: %T
Partition: %p
Offset: %o
Headers: %h\n'
@nuria
nuria / redshift_config.json
Created December 18, 2020 00:48
redshift_config
[
{
"user_group": [
"dashboarding"
],
"query_group": [],
"name": "dashboarding",
"memory_percent_to_use": 70,
"query_concurrency": 15,
"concurrency_scaling": "auto"
@nuria
nuria / sessions.scala
Created October 2, 2020 20:36
sessions.scala
// spark-shell --jars /home/otto/algebird-core_2.10-0.9.0.jar,/home/mforns/refinery-core-0.0.9.jar
import java.util.Date
import java.text.SimpleDateFormat
import org.wikimedia.analytics.refinery.core.PageviewDefinition
import org.wikimedia.analytics.refinery.core.Webrequest
import scala.math.pow
import org.apache.spark.rdd.RDD
import com.twitter.algebird.QTree
import org.apache.spark.sql.{ Encoders, SaveMode }
// Glob covering every parquet file of the PrefUpdate event data, across all
// year/month/day/hour partition directories.
val readPath: String = "/wmf/data/event/PrefUpdate/year=*/month=*/day=*/hour=*/*.parquet"
// SQL-style `in` predicate on event.property listing the preference names to keep.
// NOTE(review): presumably handed to a DataFrame filter/where -- the usage is not visible here.
val propertyWhitelistFilter: String = "event.property in ('skin', 'mfMode', 'mf_amc_optin', 'VectorSkinVersion', 'popups', 'popupsreferencepreviews', 'discussiontools-betaenable', 'betafeatures-auto-enroll' , 'echo-notifications-blacklist', 'email-blacklist', 'growthexperiments-help-panel-tog-help-panel', 'growthexperiments-homepage-enable', 'growthexperiments-homepage-pt-link')"
case class UserAgent(
browser_family: String,
browser_major: String,
browser_minor: String,
-- List the distinct base_name values containing "commons" in a single hourly
-- partition of wmf.mediarequest (year 2019, month 12, day 1, hour 1).
USE wmf;
SELECT DISTINCT base_name
FROM mediarequest
WHERE year = 2019
  AND month = 12
  AND day = 1
  AND hour = 1
  AND base_name LIKE '%commons%'
@nuria
nuria / post_error_logging.sh
Created March 3, 2020 19:43
post-error-logging
# POST a hand-written test event for the mediawiki.client.error stream to the
# intake-logging events endpoint (hasty=true). The JSON body is sent as text/plain.
curl --verbose \
  --header 'Content-Type: text/plain' \
  --data '{"$schema": "/mediawiki/client/error/1.0.0", "meta": {"stream": "mediawiki.client.error"}, "message": "test event", "type": "TEST", "url": "http://otto-test.org", "user_agent": "otto test"}' \
  'https://intake-logging.wikimedia.org/v1/events?hasty=true'
@nuria
nuria / test_error_logging.js
Last active March 6, 2020 00:33
test_error_logging
mw.config.values.wgWMEClientErrorIntakeURL
/**
{"$schema": "/mediawiki/client/error/1.0.0", "meta": {"stream": "mediawiki.client.error"}, "message": "test event", "type": "TEST", "url": "http://otto-test.org", "user_agent": "otto test"}
**/
mw.track( 'global.error', {
errorMessage: 'ayayaya',
url: 'https://intake-logging.wikimedia.org/v1/events?hasty=true',
lineNumber: 1,
columnNumber: 1,
@nuria
nuria / encoding_test.scala
Created February 28, 2020 17:55
encoding and spark
import java.net.URLDecoder
import java.net.URLEncoder
import org.apache.spark.sql.functions._
// Decodes a percent-encoded string as UTF-8. Two escapes are applied first: any '%' not
// followed by two hex digits becomes "%25", and every literal '+' becomes "%2B". Stray
// percent signs and plus signs therefore come out of the decoder unchanged.
val urlDecoder: String => String = { raw =>
  val percentEscaped = raw.replaceAll("%(?![0-9a-fA-F]{2})", "%25")
  val plusEscaped = percentEscaped.replaceAll("\\+", "%2B")
  URLDecoder.decode(plusEscaped, "UTF-8")
}
// Form-encodes a string using UTF-8.
val urlEncoder: String => String = text => URLEncoder.encode(text, "UTF-8")
// Number of '/' characters in the given string.
val countSlashes: String => Int = text => text.count(ch => ch == '/')
@nuria
nuria / spark-shell shothand
Created January 21, 2020 20:31
spark-shell
# Start spark2-shell on YARN with the refinery-job and refinery-hive jars passed via --jars.
spark2-shell \
  --master yarn \
  --executor-memory 8G \
  --executor-cores 4 \
  --driver-memory 16G \
  --conf spark.dynamicAllocation.maxExecutors=64 \
  --conf spark.executor.memoryOverhead=2048 \
  --jars /srv/deployment/analytics/refinery/artifacts/refinery-job.jar,/srv/deployment/analytics/refinery/artifacts/refinery-hive.jar
Three levels job->attempt->application to reach cassandra
The cassandra jar newly built - with exclusions - is on /tmp/oozie-nuria
# Refresh the staged oozie files on HDFS: remove /tmp/oozie-nuria recursively, recreate it,
# then upload everything under the local oozie/ directory.
# `-rm -r` is used because `-rmr` is deprecated in the Hadoop FileSystem shell.
hdfs dfs -rm -r /tmp/oozie-nuria ; hdfs dfs -mkdir /tmp/oozie-nuria; hdfs dfs -put oozie/* /tmp/oozie-nuria;
Start job: