View redshift_config.json
[
  {
    "user_group": [
      "dashboarding"
    ],
    "query_group": [],
    "name": "dashboarding",
    "memory_percent_to_use": 70,
    "query_concurrency": 15,
    "concurrency_scaling": "auto"
  }
]
View sessions.scala
// spark-shell --jars /home/otto/algebird-core_2.10-0.9.0.jar,/home/mforns/refinery-core-0.0.9.jar
import java.util.Date
import java.text.SimpleDateFormat
import org.wikimedia.analytics.refinery.core.PageviewDefinition
import org.wikimedia.analytics.refinery.core.Webrequest
import scala.math.pow
import org.apache.spark.rdd.RDD
import com.twitter.algebird.QTree
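// A minimal sketch of how these imports are typically combined: approximate
// session-length quantiles with Algebird's QTree. The sessionLengths RDD and
// the QTree depth (6) are assumptions for illustration.
import com.twitter.algebird.QTreeSemigroup
val sessionLengths: RDD[Double] = sc.parallelize(Seq(1.0, 5.0, 42.0, 300.0))
val qtSemigroup = new QTreeSemigroup[Unit](6)
val qtree = sessionLengths.map(x => QTree(x)).reduce(qtSemigroup.plus(_, _))
val (lowerMedian, upperMedian) = qtree.quantileBounds(0.5)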
View read-prefupdate.scala
import org.apache.spark.sql.{ Encoders, SaveMode }
val readPath = s"/wmf/data/event/PrefUpdate/year=*/month=*/day=*/hour=*/*.parquet"
val propertyWhitelistFilter = s"event.property in ('skin', 'mfMode', 'mf_amc_optin', 'VectorSkinVersion', 'popups', 'popupsreferencepreviews', 'discussiontools-betaenable', 'betafeatures-auto-enroll' , 'echo-notifications-blacklist', 'email-blacklist', 'growthexperiments-help-panel-tog-help-panel', 'growthexperiments-homepage-enable', 'growthexperiments-homepage-pt-link')"
case class UserAgent(
  browser_family: String,
  browser_major: String,
  browser_minor: String
)
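// A sketch of how readPath and propertyWhitelistFilter above would be applied,
// assuming a spark2-shell session providing `spark`:
val prefUpdates = spark.read.parquet(readPath).where(propertyWhitelistFilter)
prefUpdates.groupBy("event.property").count().show(50, false)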
View commons-files-viewed-from-wikis.hql
use wmf;
select distinct base_name
from mediarequest
where year = 2019
  and month = 12
  and day = 1
  and hour = 1
  and base_name like '%commons%';
View post_error_logging.sh
curl -v -H 'Content-Type: text/plain' -d'{"$schema": "/mediawiki/client/error/1.0.0", "meta": {"stream": "mediawiki.client.error"}, "message": "test event", "type": "TEST", "url": "http://otto-test.org", "user_agent": "otto test"}' 'https://intake-logging.wikimedia.org/v1/events?hasty=true'
View test_error_logging.js
// Check where the client error intake endpoint is configured to point.
mw.config.values.wgWMEClientErrorIntakeURL
/**
{"$schema": "/mediawiki/client/error/1.0.0", "meta": {"stream": "mediawiki.client.error"}, "message": "test event", "type": "TEST", "url": "http://otto-test.org", "user_agent": "otto test"}
**/
mw.track( 'global.error', {
    errorMessage: 'ayayaya',
    url: 'https://intake-logging.wikimedia.org/v1/events?hasty=true',
    lineNumber: 1,
    columnNumber: 1
} );
View encoding_test.scala
import java.net.URLDecoder
import java.net.URLEncoder
import org.apache.spark.sql.functions._
// Escape any stray '%' not followed by two hex digits so URLDecoder does not
// throw, and protect literal '+' characters from being decoded into spaces.
val urlDecoder = (u: String) => URLDecoder.decode(u.replaceAll("%(?![0-9a-fA-F]{2})", "%25").replaceAll("\\+", "%2B"), "UTF-8")
val urlEncoder = (u: String) => URLEncoder.encode(u, "UTF-8")
val countSlashes = (u: String) => u.count(_ == '/')
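// A sketch of registering the functions above as Spark UDFs. The `requests`
// DataFrame and its `uri_path` column are hypothetical.
val decodeUdf = udf(urlDecoder)
val slashesUdf = udf(countSlashes)
requests.select(decodeUdf(col("uri_path")), slashesUdf(col("uri_path"))).show(5, false)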
View spark-shell shorthand
spark2-shell --master yarn --executor-memory 8G --executor-cores 4 --driver-memory 16G --conf spark.dynamicAllocation.maxExecutors=64 --conf spark.executor.memoryOverhead=2048 --jars /srv/deployment/analytics/refinery/artifacts/refinery-job.jar,/srv/deployment/analytics/refinery/artifacts/refinery-hive.jar
View hadoop-oozie-workflow
Three levels (job -> attempt -> application) to reach Cassandra.
The newly built Cassandra jar - with exclusions - is in /tmp/oozie-nuria on HDFS:
hdfs dfs -rmr /tmp/oozie-nuria ; hdfs dfs -mkdir /tmp/oozie-nuria; hdfs dfs -put oozie/* /tmp/oozie-nuria;
Start job:
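A typical start command would look like the following (job.properties is a hypothetical file name; assumes OOZIE_URL points at the Oozie server):
oozie job -config job.properties -run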
View FailedRefineTargets.scala
// spark2-shell --jars /srv/deployment/analytics/refinery/artifacts/refinery-job.jar
/**
* Use RefineTarget.find to find all Refine targets for an input (camus job) in the last N hours.
* Then filter for any for which the _REFINED_FAILED flag exists.
*/
import org.apache.hadoop.fs.Path
import org.joda.time.format.DateTimeFormatter
import com.github.nscala_time.time.Imports._
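// A minimal sketch of the idea described above using plain Hadoop FS calls
// rather than the RefineTarget API (the base path below is hypothetical):
// glob candidate partition directories, then keep those where the
// _REFINED_FAILED flag file exists.
import org.apache.hadoop.fs.FileSystem
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val failed = fs
  .globStatus(new Path("/wmf/data/event/PrefUpdate/year=*/month=*/day=*/hour=*"))
  .map(_.getPath)
  .filter(dir => fs.exists(new Path(dir, "_REFINED_FAILED")))
failed.foreach(println)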