Skip to content

Instantly share code, notes, and snippets.

@ebernhardson
Last active April 29, 2019 17:54
Show Gist options
  • Save ebernhardson/fc95d2f1778c320321c617159a009afa to your computer and use it in GitHub Desktop.
Save ebernhardson/fc95d2f1778c320321c617159a009afa to your computer and use it in GitHub Desktop.
Poorly Performing Queries notebook
placeholder so i can push a notebook
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"spark.sql(\"ADD JAR hdfs://analytics-hadoop/wmf/refinery/current/artifacts/refinery-hive.jar\").collect()\n",
"spark.sql(\"CREATE TEMPORARY FUNCTION get_main_search_request AS 'org.wikimedia.analytics.refinery.hive.GetMainSearchRequestUDF'\").collect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"\n",
"date_start = datetime.datetime(2019, 3, 1)\n",
"date_end = datetime.datetime(2019, 3, 8)\n",
"max_res = 2\n",
"wiki = 'enwiki'\n",
"sample_size = 5000\n",
"max_q_by_day = 30\n",
"random_seed = 42"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import functions as F, types as T, Window\n",
"import calendar\n",
"\n",
"ts_start = calendar.timegm(date_start.timetuple())\n",
"ts_end = calendar.timegm(date_end.timetuple())\n",
"\n",
"row_timestamp = F.unix_timestamp(F.concat(\n",
" F.col('year'), F.lit('-'), F.col('month'), F.lit('-'), F.col('day'), \n",
" F.lit(' '), F.col('hour'), F.lit(':00:00')))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"w = Window.partitionBy(F.col('ip'), F.col('year'), F.col('month'), F.col('day'))\n",
"\n",
"results = (\n",
" spark.read.table('wmf_raw.cirrussearchrequestset')\n",
" .where(row_timestamp > ts_start)\n",
" .where(row_timestamp < ts_end)\n",
" .where(F.col('requests.queryType')[0] == 'near_match')\n",
" .withColumn('areq', F.expr('get_main_search_request(wikiid, requests)'))\n",
" .where(F.col('areq').isNotNull())\n",
" .where(F.col('areq.hitstotal') <= max_res)\n",
" .withColumn('q_by_day', F.count(F.lit(1)).over(w))\n",
" .where(F.col('q_by_day') <= max_q_by_day)\n",
" .groupBy(F.col('ip'), F.col('year'), F.col('month'), F.col('day'))\n",
" .agg(F.collect_list(F.struct('wikiid', 'areq.query', 'areq.hitsTotal')).alias('ip_requests'))\n",
" .withColumn('random_req_index', F.floor(F.rand(random_seed) * F.size(F.col('ip_requests'))))\n",
" .select(F.col('ip_requests')[F.col('random_req_index')].alias('req'))\n",
" .select(F.col('req.wikiid'), F.col('req.query'), F.col('req.hitsTotal'))\n",
" .toPandas()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results[results['query'].str.contains('\"')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"results[results['query'].str.contains(':')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"results[\"query\"] = results[\"query\"].replace(\"\\n\", \" \", regex=True)\n",
"results[\"query\"] = results[\"query\"].replace(\"\\t\", \" \", regex=True)\n",
"results.to_csv(\"/home/tjones/\" + wiki + \".txt\", sep=\"\\t\", index=False, encoding='utf-8', escapechar=None, quoting=csv.QUOTE_NONE, quotechar='\\0')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "PySpark - YARN",
"language": "python",
"name": "spark_yarn_pyspark"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment