{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Engagement Features"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import datetime\n",
"import numpy as np\n",
"import pandas as pd\n",
"import plotly.plotly as py\n",
"import plotly.graph_objs as go\n",
"from pyspark.sql.types import *\n",
"from datetime import date, timedelta\n",
"from pyspark.mllib.stat import Statistics\n",
"from pyspark.sql.window import Window\n",
"from pyspark.sql.functions import *\n",
"from pyspark.sql.functions import col, count\n",
"from pyspark.mllib.regression import LabeledPoint\n",
"from pyspark.mllib.tree import DecisionTree, RandomForest, RandomForestModel\n",
"from pyspark.mllib.classification import * \n",
"from pyspark.mllib.evaluation import MulticlassMetrics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Activity Stream Tables"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"activity_stream_events_daily_url = \"activity_stream_events_daily.csv\"\n",
"activity_stream_stats_daily_url = \"activity_stream_stats_daily.csv\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/hadoop/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning:\n",
"\n",
"Columns (11,15,16) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n",
"/home/hadoop/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning:\n",
"\n",
"Columns (14,19,20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"\n"
]
}
],
"source": [
"pandas_events = pd.read_csv(activity_stream_events_daily_url, \",\")\n",
"pandas_stats = pd.read_csv(activity_stream_stats_daily_url, \",\")"
]
},
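{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `DtypeWarning` above is pandas flagging mixed-type columns during chunked type inference. As the message suggests, it can be silenced by passing `low_memory=False`; a sketch of that optional tweak follows (the explicit Spark schema built below handles the types regardless)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Optional, illustrative: disable chunked type inference to silence the\n",
"# DtypeWarning (this notebook instead casts via an explicit Spark schema).\n",
"pandas_events = pd.read_csv(activity_stream_events_daily_url, sep=\",\", low_memory=False)\n",
"pandas_stats = pd.read_csv(activity_stream_stats_daily_url, sep=\",\", low_memory=False)"
]
},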
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"integer_types = [\"max_scroll_depth\", \"load_latency\", \"total_bookmarks\", \"total_history_size\", \"session_duration\"]\n",
"\n",
"events_fields = [StructField(field_name, IntegerType(), True) if field_name in integer_types else StructField(field_name, StringType(), True) for field_name in pandas_events.columns]\n",
"stats_fields = [StructField(field_name, IntegerType(), True) if field_name in integer_types else StructField(field_name, StringType(), True) for field_name in pandas_stats.columns]\n",
"\n",
"events_schema = StructType(events_fields)\n",
"stats_schema = StructType(stats_fields)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"activity_stream_events_daily_df = sqlContext.createDataFrame(pandas_events, schema=events_schema)\n",
"activity_stream_stats_daily_df = sqlContext.createDataFrame(pandas_stats, schema=stats_schema) "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sqlContext.registerDataFrameAsTable(activity_stream_events_daily_df, \"activity_stream_events_daily\")\n",
"sqlContext.registerDataFrameAsTable(activity_stream_stats_daily_df, \"activity_stream_stats_daily\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Identify \"Engaged\" Users - Returning at Least Once Per Week Over the Past Four Weeks"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Find how many days ago our earliest datapoint is.\n",
"earliest_date = activity_stream_events_daily_df \\\n",
" .select(\"date\") \\\n",
" .orderBy(\"date\").first()[0]\n",
"earliest_datetime = datetime.datetime.strptime(earliest_date, \"%Y-%m-%d\")\n",
"days_ago = (datetime.datetime.today() - earliest_datetime).days"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create a dataframe of all the dates between now and days_ago\n",
"base = datetime.datetime.today()\n",
"date_list = [(datetime.datetime.today() - datetime.timedelta(days=x)).date() for x in range(0, days_ago)]\n",
"pandas_df = pd.DataFrame(date_list, columns=[\"date\"])\n",
"date_list_df = sqlContext.createDataFrame(pandas_df)\n",
"sqlContext.registerDataFrameAsTable(date_list_df, \"date_list\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Map each client to all possible dates\n",
"client_list_df = activity_stream_events_daily_df.select(\"client_id\").distinct()\n",
"all_dates_clients_df = date_list_df.join(client_list_df)\n",
"sqlContext.registerDataFrameAsTable(all_dates_clients_df, \"all_dates_clients\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create a table of user interactions per day\n",
"interactions_per_user_per_day_df = \\\n",
" activity_stream_events_daily_df \\\n",
" .select(\"client_id\", \"date\") \\\n",
" .groupBy(\"client_id\", \"date\") \\\n",
" .count().orderBy(\"client_id\", \"date\")\n",
"interactions_per_user_per_day_df = interactions_per_user_per_day_df.withColumn(\"date\", col(\"date\").cast(\"date\"))\n",
"sqlContext.registerDataFrameAsTable(interactions_per_user_per_day_df, \"interactions_per_user_per_day\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+----------+-----+\n",
"| client_id| date|count|\n",
"+--------------------+----------+-----+\n",
"|00025585-a4e0-4cd...|2016-05-24| 13|\n",
"|00025585-a4e0-4cd...|2016-05-25| 3|\n",
"|00025585-a4e0-4cd...|2016-05-26| 1|\n",
"|00025585-a4e0-4cd...|2016-05-27| 3|\n",
"|00025585-a4e0-4cd...|2016-05-28| 3|\n",
"|00025585-a4e0-4cd...|2016-05-30| 5|\n",
"|00025585-a4e0-4cd...|2016-05-31| 8|\n",
"|00025585-a4e0-4cd...|2016-06-01| 5|\n",
"|00025585-a4e0-4cd...|2016-06-02| 10|\n",
"|00025585-a4e0-4cd...|2016-06-03| 11|\n",
"|00025585-a4e0-4cd...|2016-06-04| 10|\n",
"|00025585-a4e0-4cd...|2016-06-05| 1|\n",
"|00025585-a4e0-4cd...|2016-06-06| 17|\n",
"|00025585-a4e0-4cd...|2016-06-07| 5|\n",
"|00025585-a4e0-4cd...|2016-06-08| 3|\n",
"|00025585-a4e0-4cd...|2016-06-09| 5|\n",
"|00025585-a4e0-4cd...|2016-06-10| 3|\n",
"|00025585-a4e0-4cd...|2016-06-13| 3|\n",
"|00025585-a4e0-4cd...|2016-06-14| 5|\n",
"|00025585-a4e0-4cd...|2016-06-15| 1|\n",
"+--------------------+----------+-----+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"interactions_per_user_per_day_df.show()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"all_users_dates_counts = sqlContext.sql(\n",
" \"SELECT all_dates_clients.client_id, all_dates_clients.date, COALESCE(interactions_per_user_per_day.count, 0) AS count \" +\n",
" \"FROM all_dates_clients \" +\n",
" \"LEFT JOIN interactions_per_user_per_day \" +\n",
" \"ON all_dates_clients.client_id = interactions_per_user_per_day.client_id \" +\n",
" \"AND all_dates_clients.date = interactions_per_user_per_day.date \" +\n",
" \"ORDER BY client_id, date\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[client_id: string, date: date, count: bigint]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_users_dates_counts.cache()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"# Hive timestamp is interpreted as UNIX timestamp in seconds*\n",
"days = lambda i: i * 86400 \n",
"\n",
"w = (Window()\n",
" .partitionBy(col(\"client_id\"))\n",
" .orderBy(col(\"date\").cast(\"timestamp\").cast(\"long\"))\n",
" .rangeBetween(-days(6), 0))\n",
"\n",
"weekly_avgs = all_users_dates_counts.select(col(\"client_id\"),\n",
" col(\"date\").cast(\"timestamp\"),\n",
" col(\"count\"),\n",
" mean(\"count\").over(w).alias(\"week_avg\"))"
]
},
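{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sanity check, here is a pure-Python sketch of the trailing 7-day mean that the window above computes, on toy data: each day's `week_avg` is the mean of that day's count and the six days before it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Toy cross-check of the trailing 7-day mean (hypothetical counts).\n",
"counts = [0, 7, 0, 0, 0, 0, 0, 7]\n",
"week_avg = [sum(counts[max(0, i - 6):i + 1]) / float(min(i + 1, 7))\n",
"            for i in range(len(counts))]\n",
"print week_avg  # the last day averages days 2-8: 14 / 7 = 2.0"
]
},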
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+--------------------+-----+--------+\n",
"| client_id| date|count|week_avg|\n",
"+--------------------+--------------------+-----+--------+\n",
"|01e6ca6d-0a0b-47b...|2016-04-08 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-09 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-10 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-11 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-12 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-13 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-14 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-15 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-16 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-17 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-18 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-19 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-20 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-21 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-22 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-23 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-24 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-25 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-26 00:00:...| 0| 0.0|\n",
"|01e6ca6d-0a0b-47b...|2016-04-27 00:00:...| 0| 0.0|\n",
"+--------------------+--------------------+-----+--------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"weekly_avgs.show()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"4686"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The number of users who return at least once per week over the past 3 weeks\n",
"engaged_users = weekly_avgs \\\n",
" .filter(col(\"date\") >= date_sub(current_date(), 21)) \\\n",
" .filter(col(\"date\") <= date_sub(current_date(), 0)) \\\n",
" .where(col(\"week_avg\") > 0) \\\n",
" .select(\"client_id\") \\\n",
" .groupBy(\"client_id\") \\\n",
" .count() \\\n",
" .where(col(\"count\") == 21) \\\n",
" .select(\"client_id\").distinct()\n",
"\n",
"engaged_users.count()"
]
},
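{
"cell_type": "markdown",
"metadata": {},
"source": [
"The filter above requires `week_avg > 0` on each of the 21 days, i.e. at least one interaction in every trailing 7-day window. A minimal pure-Python sketch of the same rule on toy data (`is_engaged` is a hypothetical name for illustration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Toy version of the engagement rule: engaged means every trailing 7-day\n",
"# window over the last 21 days contains at least one interaction.\n",
"def is_engaged(counts, days=21, window=7):\n",
"    return all(sum(counts[max(0, i - window + 1):i + 1]) > 0\n",
"               for i in range(len(counts) - days, len(counts)))\n",
"\n",
"print is_engaged([0, 1, 0, 0, 0, 0, 0] * 4)  # True: one visit each week\n",
"print is_engaged([0] * 21 + [1] * 7)         # False: an early window is empty"
]
},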
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"14742"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"unengaged_users = activity_stream_stats_daily_df \\\n",
" .where(col(\"date\") >= date_sub(current_date(), 21)) \\\n",
" .select(\"client_id\").distinct() \\\n",
" .subtract(engaged_users)\n",
"\n",
"unengaged_users.count()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2016-07-24\n",
"Engaged Users: 4739\n",
"Unengaged Users: 14905\n",
"Total Users: 19644\n",
"2016-07-17\n",
"Engaged Users: 5083\n",
"Unengaged Users: 15468\n",
"Total Users: 20551\n",
"2016-07-10\n",
"Engaged Users: 5447\n",
"Unengaged Users: 16314\n",
"Total Users: 21761\n",
"2016-07-03\n",
"Engaged Users: 5110\n",
"Unengaged Users: 18356\n",
"Total Users: 23466\n",
"2016-06-26\n",
"Engaged Users: 4649\n",
"Unengaged Users: 20669\n",
"Total Users: 25318\n",
"2016-06-19\n",
"Engaged Users: 4695\n",
"Unengaged Users: 21330\n",
"Total Users: 26025\n",
"2016-06-12\n",
"Engaged Users: 4105\n",
"Unengaged Users: 21180\n",
"Total Users: 25285\n",
"2016-06-05\n",
"Engaged Users: 2622\n",
"Unengaged Users: 22806\n",
"Total Users: 25428\n"
]
}
],
"source": [
"dates = []\n",
"engaged = []\n",
"unengaged = []\n",
"for i in xrange(8):\n",
" engaged_users = weekly_avgs \\\n",
" .filter(col(\"date\") >= date_sub(current_date(), 7 * i + 21)) \\\n",
" .filter(col(\"date\") <= date_sub(current_date(), 7 * i)) \\\n",
" .where(col(\"week_avg\") > 0) \\\n",
" .select(\"client_id\") \\\n",
" .groupBy(\"client_id\") \\\n",
" .count() \\\n",
" .where(col(\"count\") == 21) \\\n",
" .select(\"client_id\").distinct() \n",
" \n",
" unengaged_users = activity_stream_stats_daily_df \\\n",
" .filter(col(\"date\") >= date_sub(current_date(), 7 * i + 21)) \\\n",
" .filter(col(\"date\") <= date_sub(current_date(), 7 * i)) \\\n",
" .select(\"client_id\").distinct() \\\n",
" .subtract(engaged_users)\n",
"\n",
" end_of_week = date.today() - timedelta(days=7 * i)\n",
" print end_of_week\n",
" print \"Engaged Users: \" + str(engaged_users.count())\n",
" print \"Unengaged Users: \" + str(unengaged_users.count())\n",
" print \"Total Users: \" + str(engaged_users.count() + unengaged_users.count())\n",
" dates.append(end_of_week)\n",
" engaged.append(engaged_users.count())\n",
" unengaged.append(unengaged_users.count())\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\"seamless=\"seamless\" src=\"https://plot.ly/~mozilla/940.embed\" height=\"525\" width=\"100%\"></iframe>"
],
"text/plain": [
"<plotly.tools.PlotlyDisplay object>"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import plotly.plotly as py\n",
"import plotly.graph_objs as go\n",
"\n",
"trace1 = go.Bar(\n",
" x=dates,\n",
" y=engaged,\n",
" name='Engaged Users'\n",
")\n",
"trace2 = go.Bar(\n",
" x=dates,\n",
" y=unengaged,\n",
" name='Unengaged Users'\n",
")\n",
"\n",
"data = [trace1, trace2]\n",
"layout = go.Layout(\n",
" barmode='stack',\n",
" xaxis=dict(\n",
" title='3-Week Period End Date',\n",
" titlefont=dict(\n",
" family='Courier New, monospace',\n",
" size=18,\n",
" color='#7f7f7f'\n",
" )\n",
" ),\n",
" yaxis=dict(\n",
" title='Number of Users',\n",
" titlefont=dict(\n",
" family='Courier New, monospace',\n",
" size=18,\n",
" color='#7f7f7f'\n",
" )\n",
" )\n",
")\n",
"\n",
"fig = go.Figure(data=data, layout=layout)\n",
"py.iplot(fig, filename='stacked-bar')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Aggregating the Various Component Metrics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loyalty Index"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-----------------+\n",
"| client_id| loyalty|\n",
"+--------------------+-----------------+\n",
"|6ed543ad-1a37-f84...|99.91735537190083|\n",
"|b736dd76-567c-4f4...|99.88331388564761|\n",
"|3caf9141-fea2-4b4...|99.86962190352021|\n",
"|201d3217-e202-8d4...|99.84639016897081|\n",
"|f9fbb997-d1ee-4f7...|99.83948635634029|\n",
"|8a4c60ba-08ea-482...|99.79166666666667|\n",
"|1d78ab7e-0ee8-154...|99.79166666666667|\n",
"|fb7b3bcb-151d-439...| 99.7907949790795|\n",
"|ff486d55-cff8-184...|99.79035639412997|\n",
"|8ee47177-9238-8c4...|99.77678571428571|\n",
"|00ec0200-9faf-406...|99.77272727272727|\n",
"|3f6162f3-9319-426...|99.76851851851852|\n",
"|05083203-f7c2-450...|99.76689976689977|\n",
"|8ff28fe8-6401-417...|99.76689976689977|\n",
"|b0db015c-60c4-f84...|99.76635514018692|\n",
"|6a7683fb-a6b5-451...|99.76190476190476|\n",
"|697ea9a1-4909-a34...|99.76019184652279|\n",
"|c9b5ef85-57db-c14...|99.75124378109453|\n",
"|141623e8-f407-496...|99.74554707379136|\n",
"|88685cc1-64a3-4be...|99.74160206718346|\n",
"+--------------------+-----------------+\n",
"only showing top 20 rows\n",
"\n"
]
},
{
"data": {
"text/plain": [
"13979"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Users who haven't used activity stream at all in the past 7 days have a loyalty index of 0.\n",
"loyalty_index = activity_stream_stats_daily_df \\\n",
" .where(col(\"date\") > date_sub(current_date(), 7)) \\\n",
" .select(\"client_id\") \\\n",
" .groupBy(\"client_id\").count() \\\n",
" .select(\"client_id\", ((1 - (1 / col(\"count\"))) * 100).alias(\"loyalty\")) \\\n",
" .orderBy(desc(\"loyalty\"))\n",
"\n",
"loyalty_index.show()\n",
"loyalty_index.count()"
]
},
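{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loyalty index is `(1 - 1/n) * 100`, where `n` is the number of stats rows a client has in the past 7 days: 0 for a single row, approaching 100 as `n` grows. A quick worked example with illustrative values of `n`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Worked example of the loyalty formula (illustrative n values only).\n",
"# Note the 1.0: plain Python 2 integer division would truncate 1 / n,\n",
"# unlike the Spark column arithmetic above.\n",
"for n in [1, 2, 10, 480]:\n",
"    print n, (1 - 1.0 / n) * 100  # 0.0, 50.0, 90.0, ~99.79"
]
},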
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Block and Delete Rate"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+---------------------+\n",
"| client_id|neg_interaction_index|\n",
"+--------------------+---------------------+\n",
"|efbf10b8-e7b9-4cc...| 100.0|\n",
"|5b0c21a7-3d24-4c9...| 100.0|\n",
"|5a5b465b-8954-41f...| 100.0|\n",
"|6a0be346-b48d-48f...| 100.0|\n",
"|8810a3b6-d858-4ec...| 100.0|\n",
"|51dbdfbc-e5f8-4f8...| 100.0|\n",
"|3a1adc85-e9ad-4a3...| 100.0|\n",
"|12bdd0c3-89c8-4e0...| 100.0|\n",
"|88af7c76-f1c0-45c...| 100.0|\n",
"|62bb8554-65d3-40f...| 100.0|\n",
"|3a05996a-d8c2-4c0...| 100.0|\n",
"|8d38cb1b-3578-46e...| 100.0|\n",
"|f794945f-d161-438...| 100.0|\n",
"|d6a21da9-5088-454...| 100.0|\n",
"|0e92be78-f80c-4c4...| 100.0|\n",
"|f9e60b49-3119-4ee...| 100.0|\n",
"|c4ccdec9-8796-f04...| 100.0|\n",
"|d35948d9-1948-41f...| 100.0|\n",
"|40f7e687-ad3a-47d...| 100.0|\n",
"|f2ffa7c6-7a36-432...| 100.0|\n",
"+--------------------+---------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"# Only includes sessions where session_id exists\n",
"events = activity_stream_events_daily_df \\\n",
" .where((col(\"event\") == \"DELETE\") | (col(\"event\") == \"BLOCK\"))\n",
"\n",
"neg_interaction_rate = activity_stream_stats_daily_df \\\n",
" .where(col(\"session_id\") != 'n/a') \\\n",
" .where(col(\"session_id\") != 'NaN') \\\n",
" .select(\"client_id\", \"session_id\") \\\n",
" .join(events, \\\n",
" activity_stream_stats_daily_df.session_id == events.session_id, \"outer\") \\\n",
" .select(activity_stream_stats_daily_df.client_id.alias(\"stats_client_id\"), \\\n",
" activity_stream_stats_daily_df.session_id.alias(\"stats_session_id\"), \\\n",
" events.client_id.alias(\"events_client_id\"), \\\n",
" events.session_id.alias(\"events_session_id\"))\n",
" \n",
"neg_interaction_index = neg_interaction_rate.groupBy(\"stats_client_id\") \\\n",
" .agg((countDistinct(neg_interaction_rate.events_session_id) / countDistinct(neg_interaction_rate.stats_session_id) * 100).alias(\"neg_interaction_index\")) \\\n",
" .na.drop().orderBy(desc(\"neg_interaction_index\")) \\\n",
" .select(col(\"stats_client_id\").alias(\"client_id\"), \"neg_interaction_index\")\n",
"\n",
"neg_interaction_index.show()"
]
},
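{
"cell_type": "markdown",
"metadata": {},
"source": [
"`neg_interaction_index` is therefore the share of a client's distinct sessions that contain at least one BLOCK or DELETE event. A toy illustration of the ratio (hypothetical session ids):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Toy illustration of the per-client rate computed above.\n",
"all_sessions = {\"s1\", \"s2\", \"s3\", \"s4\"}  # a client's distinct session_ids\n",
"neg_sessions = {\"s2\"}                     # sessions with a BLOCK/DELETE event\n",
"print len(neg_sessions & all_sessions) / float(len(all_sessions)) * 100  # 25.0"
]
},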
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Interaction Rate"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-----------------+\n",
"| client_id|interaction_index|\n",
"+--------------------+-----------------+\n",
"|7a6e1b37-c76d-4b7...| 100.0|\n",
"|4ec69f00-1d4b-472...| 100.0|\n",
"|21aa4214-8954-40e...| 100.0|\n",
"|2a5e105a-7bdb-a64...| 100.0|\n",
"|e367cbd3-9dce-434...| 100.0|\n",
"|4ec9cb9e-7dfe-4db...| 100.0|\n",
"|62794d3e-50f9-4eb...| 100.0|\n",
"|13348859-13f0-49d...| 100.0|\n",
"|e438e2ef-c475-488...| 100.0|\n",
"|048ac47e-fb89-4e6...| 100.0|\n",
"|f794945f-d161-438...| 100.0|\n",
"|47f67bfe-3fd3-4fd...| 100.0|\n",
"|27411796-6c7f-480...| 100.0|\n",
"|0a0bed91-eb14-48c...| 100.0|\n",
"|35906457-b148-4ea...| 100.0|\n",
"|7f8272bc-417e-438...| 100.0|\n",
"|8a073505-c22b-4ae...| 100.0|\n",
"|22e44914-0a7d-4a3...| 100.0|\n",
"|006b7c41-5034-5f4...| 100.0|\n",
"|07b5669e-6112-46f...| 100.0|\n",
"+--------------------+-----------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"# Only includes sessions where session_id exists\n",
"positive_events = activity_stream_events_daily_df \\\n",
" .where(col(\"event\") != \"DELETE\") \\\n",
" .where(col(\"event\") != \"BLOCK\")\n",
"\n",
"interaction_rate = activity_stream_stats_daily_df \\\n",
" .where(col(\"session_id\") != 'n/a') \\\n",
" .where(col(\"session_id\") != 'NaN') \\\n",
" .select(\"client_id\", \"session_id\") \\\n",
" .join(positive_events, \\\n",
" activity_stream_stats_daily_df.session_id == positive_events.session_id, \"outer\") \\\n",
" .select(activity_stream_stats_daily_df.client_id.alias(\"stats_client_id\"), \\\n",
" activity_stream_stats_daily_df.session_id.alias(\"stats_session_id\"), \\\n",
" positive_events.client_id.alias(\"events_client_id\"), \\\n",
" positive_events.session_id.alias(\"events_session_id\"))\n",
" \n",
"interaction_index = interaction_rate.groupBy(\"stats_client_id\") \\\n",
" .agg((countDistinct(interaction_rate.events_session_id) / countDistinct(interaction_rate.stats_session_id) * 100).alias(\"interaction_index\")) \\\n",
" .na.drop().orderBy(desc(\"interaction_index\")) \\\n",
" .select(col(\"stats_client_id\").alias(\"client_id\"), \"interaction_index\")\n",
"\n",
"interaction_index.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Recency Index"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-----------------+\n",
"| client_id| recency_index|\n",
"+--------------------+-----------------+\n",
"|5858346a-8d61-403...|98.64864864864865|\n",
"|a5d56ebb-6abb-480...|98.63013698630137|\n",
"|ceb0f74a-c46d-42e...|98.63013698630137|\n",
"|323263fb-45be-4f3...|98.63013698630137|\n",
"|c1961f31-d1b4-4f7...|98.63013698630137|\n",
"|e9b50d52-fcc8-4d3...|98.63013698630137|\n",
"|ee5ec985-f1f9-42b...|98.63013698630137|\n",
"|8ca2e77b-ff90-469...|98.63013698630137|\n",
"|877da732-b6c1-4a6...|98.63013698630137|\n",
"|35d0d5b6-21d1-414...|98.63013698630137|\n",
"|6338c1e7-755e-449...|98.63013698630137|\n",
"|e3c95daf-cd8a-409...|98.61111111111111|\n",
"|ea9c0078-c6bb-439...|98.61111111111111|\n",
"|42c7c6b1-90e3-45d...|98.61111111111111|\n",
"|1def691d-5ca3-44f...|98.61111111111111|\n",
"|583e64c4-c20b-428...|98.61111111111111|\n",
"|59b23f7e-da53-4b6...|98.61111111111111|\n",
"|cd356540-84b8-404...|98.59154929577466|\n",
"|567fd585-3a27-488...|98.59154929577466|\n",
"|98f01aff-91b0-439...|98.59154929577466|\n",
"+--------------------+-----------------+\n",
"only showing top 20 rows\n",
"\n"
]
},
{
"data": {
"text/plain": [
"33753"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wSpec = Window \\\n",
" .partitionBy(col(\"client_id\")) \\\n",
" .orderBy(desc(\"date\"))\n",
"\n",
"recency_index = activity_stream_events_daily_df \\\n",
" .select(\"client_id\", \"date\", coalesce(1 / datediff(col(\"date\"), lead(\"date\").over(wSpec)), lit(0.0)).alias(\"recency_index\")) \\\n",
" .groupBy(\"client_id\", \"date\").max(\"recency_index\").alias(\"recency_index\") \\\n",
" .orderBy(\"client_id\", \"date\") \\\n",
" .groupBy(\"client_id\").avg(\"max(recency_index)\").select(\"*\")\n",
" \n",
"recency_index = recency_index \\\n",
" .select(\"client_id\", (col(recency_index.columns[1]) * 100).alias(\"recency_index\")) \\\n",
" .orderBy(desc(\"recency_index\"))\n",
"\n",
"recency_index.show()\n",
"recency_index.count()"
]
},
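{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each active day contributes `1 / gap`, where `gap` is the number of days since the client's previous active day; the earliest active day, having no predecessor, contributes 0. The index is the per-client mean of these contributions, times 100. A hypothetical worked example:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hypothetical worked example of the recency index.\n",
"gaps = [1, 1, 4, 1]                       # days between consecutive active dates\n",
"scores = [1.0 / g for g in gaps] + [0.0]  # earliest date has no predecessor\n",
"print sum(scores) / len(scores) * 100     # (1 + 1 + 0.25 + 1 + 0) / 5 = 65.0"
]
},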
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Session Duration Index"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+----------------------+\n",
"| client_id|session_duration_index|\n",
"+--------------------+----------------------+\n",
"|8a405600-9a10-4a4...| 100.0|\n",
"|f1d5ae90-3e6c-4e3...| 100.0|\n",
"|8e13812f-7b4a-4fa...| 100.0|\n",
"|ddfd6d88-367a-474...| 100.0|\n",
"|234540b6-6545-411...| 100.0|\n",
"|ca3ad84b-3c6c-481...| 100.0|\n",
"|36ca25d6-feb1-475...| 100.0|\n",
"|d60166de-69b1-401...| 100.0|\n",
"|663d941a-66ee-4f8...| 100.0|\n",
"|76154650-0e5e-a04...| 100.0|\n",
"|8f9df96f-42ee-4d7...| 100.0|\n",
"|e387dd3c-4f1d-497...| 100.0|\n",
"|765c083c-ae2a-4d5...| 100.0|\n",
"|8bd3b298-50d6-4e9...| 100.0|\n",
"|7af2b542-fe8d-4f3...| 100.0|\n",
"|9afeb00e-d1bc-467...| 100.0|\n",
"|74136c2e-da4d-4c8...| 100.0|\n",
"|cbd950dc-3f72-45a...| 100.0|\n",
"|8661ae81-d0f0-564...| 100.0|\n",
"|d00b3ab2-29a3-446...| 100.0|\n",
"+--------------------+----------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"# Only includes sessions within the past 30 days\n",
"session_counts = activity_stream_stats_daily_df \\\n",
" .where(col(\"date\") > date_sub(current_date(), 30)) \\\n",
" .groupBy(\"client_id\").count() \\\n",
" .select(col(\"client_id\").alias(\"client_id1\"), col(\"count\").alias(\"count1\"))\n",
" \n",
"long_session_counts = activity_stream_stats_daily_df \\\n",
" .where(col(\"date\") > date_sub(current_date(), 30)) \\\n",
" .where(col(\"session_duration\") >= 5500) \\\n",
" .groupBy(\"client_id\").count() \\\n",
" .select(col(\"client_id\").alias(\"client_id2\"), col(\"count\").alias(\"count2\"))\n",
"\n",
"session_duration_index = session_counts \\\n",
" .join(long_session_counts, session_counts.client_id1 == long_session_counts.client_id2) \\\n",
" .select(session_counts.client_id1, \\\n",
" (long_session_counts.count2 / session_counts.count1 * 100).alias(\"session_duration_index\")) \\\n",
" .orderBy(desc(\"session_duration_index\"))\n",
"\n",
"session_duration_index = session_duration_index.select(col(\"client_id1\").alias(\"client_id\"), \"session_duration_index\")\n",
"\n",
"session_duration_index.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Average Interaction Count"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+------------------+\n",
"| client_id| avg_interactions|\n",
"+--------------------+------------------+\n",
"|09ad4eb8-8e45-4f2...| 9.725|\n",
"|0a016923-fd82-439...|1.6842105263157894|\n",
"|0b6bef36-baf6-46e...| 2.586206896551724|\n",
"|1712a92f-5489-304...| 1.0|\n",
"|17c7d60b-0b4e-44b...| 1.0|\n",
"|2465b3c3-1827-49e...| 1.0|\n",
"|3d5bab68-b285-473...|2.0303030303030303|\n",
"|4a49a612-d836-491...| 2.0|\n",
"|4aa2baf6-7277-433...| 2.0|\n",
"|61e55a37-1885-4f4...|2.2222222222222223|\n",
"|7a13ecb8-078d-4e8...|2.3333333333333335|\n",
"|7a17b649-1a78-46c...| 1.4|\n",
"|86a43b99-1dc5-46c...| 2.5|\n",
"|a2946180-cd03-584...| 1.6875|\n",
"|b1888ec7-2112-42c...| 1.0|\n",
"|bdd2523b-f730-4a2...| 1.0|\n",
"|cab7dfc7-6fb1-454...| 1.0|\n",
"|cfc203b1-b9f2-a64...| 1.0|\n",
"|d97ee8c8-43e8-418...|1.6153846153846154|\n",
"|e9d57a52-27aa-464...|1.8888888888888888|\n",
"+--------------------+------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"# Compute avg interactions for each client_id\n",
"avg_interactions = interactions_per_user_per_day_df \\\n",
" .groupBy(\"client_id\") \\\n",
" .avg(\"count\") \\\n",
" .select(col(\"client_id\"), col(\"avg(count)\").alias(\"avg_interactions\"))\n",
"avg_interactions.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate Engaged User Features Table"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-----------------+------------------+------------------+----------------------+------------------+---------------------+\n",
"| client_id| loyalty| recency_index| interaction_index|session_duration_index| avg_interactions|neg_interaction_index|\n",
"+--------------------+-----------------+------------------+------------------+----------------------+------------------+---------------------+\n",
"|0653bd05-1463-450...| 96.0| 75.67460317460318| 27.77777777777778| 90.25974025974025| 2.642857142857143| 0.0|\n",
"|09ad4eb8-8e45-4f2...|98.50746268656717| 84.88005050505049| 57.3170731707317| 81.32183908045977| 9.725| 0.34843205574912894|\n",
"|12a4cd26-1ee3-4f3...|98.41269841269842| 88.8888888888889| 35.12974051896208| 68.26568265682657| 4.066666666666666| 0.5988023952095809|\n",
"|1403919f-ca6a-104...|96.96969696969697| 97.2972972972973| 64.26116838487972| 57.74647887323944| 5.27027027027027| 1.718213058419244|\n",
"|1b7f23c1-329e-4d5...|94.11764705882352| 68.41036414565826|10.615711252653929| 44.0| 2.764705882352941| 0.0|\n",
"|2ad75bfd-bfc5-46b...|98.85057471264368| 98.36065573770492| 84.57888493475683| 91.49888143176734|14.114754098360656| 0.11862396204033215|\n",
"|2caa3ee3-545c-4cd...| 75.0| 58.47222222222223| 84.48275862068965| 92.3076923076923|2.7916666666666665| 0.0|\n",
"|348d742b-f8d2-4fb...|98.76543209876543| 62.16666666666666| 2.705882352941176| 42.095238095238095| 2.433333333333333| 0.1176470588235294|\n",
"|389b289d-05e9-4f6...|97.14285714285714| 72.60683760683759|4.3478260869565215| 62.4|1.7179487179487178| 0.37267080745341613|\n",
"|38a89040-b377-43d...| 80.0|53.333333333333336| 60.71428571428571| 44.0|1.7619047619047619| 1.7857142857142856|\n",
"|3d5bab68-b285-473...|85.71428571428572| 70.988455988456| 38.84297520661157| 55.434782608695656|2.0303030303030303| 0.0|\n",
"|4142e8b7-2a78-47a...|91.66666666666666| 40.64814814814815| 19.0| 73.68421052631578|1.8333333333333333| 1.0|\n",
"|41e79e54-a77e-482...|99.23076923076923| 59.27083333333334| 2.913533834586466| 51.03448275862069| 2.25| 0.09398496240601503|\n",
"|4405c630-fe3c-43c...|98.38709677419355| 89.42528735632183| 36.19246861924686| 72.45901639344262|3.9655172413793105| 0.0|\n",
"|4cbf7f71-dfac-44d...|97.36842105263158| 97.91666666666666| 82.88288288288288| 69.58762886597938|10.291666666666666| 1.2612612612612613|\n",
"|612f02dc-eb7a-4c6...|94.11764705882352| 78.66666666666667| 45.13274336283185| 92.94117647058823| 2.76| 5.3097345132743365|\n",
"|64ede5a3-7914-49c...|97.43589743589743| 94.87179487179486| 38.83495145631068| 76.96629213483146| 5.553846153846154| 0.0|\n",
"|6a38334e-2cf2-4b6...|98.63013698630137| 78.04347826086956| 6.557377049180328| 44.166666666666664| 14.91304347826087| 6.557377049180328|\n",
"|762eb5b1-bca9-4ef...|91.66666666666666| 75.0| 84.40366972477065| 54.43037974683544| 4.0| 0.0|\n",
"|8cd05e12-61f6-45e...| 80.0| 66.61764705882354| 88.40579710144928| 81.81818181818183|2.2058823529411766| 0.0|\n",
"+--------------------+-----------------+------------------+------------------+----------------------+------------------+---------------------+\n",
"only showing top 20 rows\n",
"\n",
"4686\n"
]
}
],
"source": [
"# Merge with recency index\n",
"engaged_merged = engaged_users \\\n",
" .join(recency_index, engaged_users.client_id == recency_index.client_id) \\\n",
" .drop(engaged_users.client_id) \\\n",
"\n",
"# Merge with loyalty index\n",
"engaged_merged = engaged_merged \\\n",
" .join(loyalty_index, engaged_merged.client_id == loyalty_index.client_id, \"outer\") \\\n",
" .select(engaged_merged.client_id.alias(\"client_id\"), \"loyalty\", \"recency_index\") \\\n",
" .na.drop('any', None, [\"client_id\"]) \\\n",
" .na.fill({'loyalty': 0})\n",
"\n",
"# Merge with interaction index\n",
"engaged_merged = engaged_merged \\\n",
" .join(interaction_index, engaged_merged.client_id == interaction_index.client_id, \"outer\") \\\n",
" .select(engaged_merged.client_id.alias(\"client_id\"), \"loyalty\", \"recency_index\", \"interaction_index\") \\\n",
" .na.drop('any', None, [\"client_id\"]) \\\n",
" .na.fill({'interaction_index': 0})\n",
" \n",
"# Merge with session duration index\n",
"engaged_merged = engaged_merged \\\n",
" .join(session_duration_index, engaged_merged.client_id == session_duration_index.client_id, \"outer\") \\\n",
" .select(engaged_merged.client_id.alias(\"client_id\"), \"loyalty\", \"recency_index\", \"interaction_index\", \"session_duration_index\") \\\n",
" .na.drop('any', None, [\"client_id\"]) \\\n",
" .na.fill({'session_duration_index': 0})\n",
"\n",
"# Merge with avg interactions\n",
"engaged_merged = engaged_merged \\\n",
" .join(avg_interactions, engaged_merged.client_id == avg_interactions.client_id, \"outer\") \\\n",
" .select(engaged_merged.client_id.alias(\"client_id\"), \"loyalty\", \"recency_index\", \"interaction_index\", \"session_duration_index\", \"avg_interactions\") \\\n",
" .na.drop('any', None, [\"client_id\"]) \\\n",
" .na.fill({'avg_interactions': 0})\n",
" \n",
"# Merge with neg interaction index\n",
"engaged_merged = engaged_merged \\\n",
" .join(neg_interaction_index, engaged_merged.client_id == neg_interaction_index.client_id, \"outer\") \\\n",
" .select(engaged_merged.client_id.alias(\"client_id\"), \"loyalty\", \"recency_index\", \"interaction_index\", \"session_duration_index\", \"avg_interactions\", \"neg_interaction_index\") \\\n",
" .na.drop('any', None, [\"client_id\"]) \\\n",
" .na.fill({'neg_interaction_index': 0})\n",
" \n",
" \n",
"engaged_merged.show()\n",
"print engaged_merged.count()"
]
},
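{
"cell_type": "markdown",
"metadata": {},
"source": [
"The six join-and-fill steps above (repeated for the unengaged table below) all follow one pattern, so they could be collapsed into a small helper. A hedged refactoring sketch, not from the original notebook; `merge_metric` is a hypothetical name:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: outer-join one metric onto the feature table, keep only rows with a\n",
"# known client_id, and default the missing metric to 0 (mirroring the chained\n",
"# joins above).\n",
"def merge_metric(base, metric_df, metric_col):\n",
"    kept = [c for c in base.columns if c != \"client_id\"]\n",
"    return base.join(metric_df, base.client_id == metric_df.client_id, \"outer\") \\\n",
"               .select(base[\"client_id\"], *(kept + [metric_col])) \\\n",
"               .na.drop('any', None, [\"client_id\"]) \\\n",
"               .na.fill({metric_col: 0})\n",
"\n",
"# Example usage, equivalent to one of the steps above:\n",
"# engaged_merged = merge_metric(engaged_merged, loyalty_index, \"loyalty\")"
]
},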
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Row(avg(loyalty)=89.21253148964226)]\n",
"[Row(avg(recency_index)=73.92858847678862)]\n",
"[Row(avg(interaction_index)=36.34412466080004)]\n",
"[Row(avg(neg_interaction_index)=0.9039897561336679)]\n",
"[Row(avg(session_duration_index)=64.46732204561148)]\n"
]
}
],
"source": [
"print engaged_merged.agg(avg(col(\"loyalty\"))).take(1)\n",
"print engaged_merged.agg(avg(col(\"recency_index\"))).take(1)\n",
"print engaged_merged.agg(avg(col(\"interaction_index\"))).take(1)\n",
"print engaged_merged.agg(avg(col(\"neg_interaction_index\"))).take(1)\n",
"print engaged_merged.agg(avg(col(\"session_duration_index\"))).take(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate Unengaged User Features Table"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-----------------+------------------+------------------+----------------------+------------------+---------------------+\n",
"| client_id| loyalty| recency_index| interaction_index|session_duration_index| avg_interactions|neg_interaction_index|\n",
"+--------------------+-----------------+------------------+------------------+----------------------+------------------+---------------------+\n",
"|03fd18b7-cfcb-4cd...| 0.0| 61.23188405797102| 59.64912280701754| 96.7741935483871| 2.608695652173913| 12.280701754385964|\n",
"|0a016923-fd82-439...|98.11320754716981| 64.8893065998329| 4.703476482617587| 79.92700729927007|1.6842105263157894| 0.0|\n",
"|0b6bef36-baf6-46e...|92.85714285714286|62.523510971786834|31.818181818181817| 34.84848484848485| 2.586206896551724| 0.0|\n",
"|1712a92f-5489-304...| 0.0| 4.166666666666666| 12.5| 100.0| 1.0| 0.0|\n",
"|1c6e939d-8b47-44b...|98.63013698630137| 31.02272727272728| 3.206997084548105| 52.118644067796616|2.1818181818181817| 0.2915451895043732|\n",
"|1d8256ec-7342-43b...| 80.0|50.829725829725824| 53.84615384615385| 76.92307692307693| 2.227272727272727| 0.0|\n",
"|1d8a5a4a-70a5-445...|95.65217391304348| 3.421052631578948|0.3225806451612903| 45.664739884393065| 1.0| 0.3225806451612903|\n",
"|1ed0b7f2-4f8b-483...|97.95918367346938|0.8771929824561403|0.2652519893899204| 53.159851301115246| 1.5| 0.0|\n",
"|2098bfd0-40c7-764...| 0.0| 3.341687552213868|1.2422360248447204| 61.53846153846154|2.3333333333333335| 0.0|\n",
"|2293bf9f-ee57-4cb...| 0.0|49.999999999999986| 45.3125| 86.11111111111111|1.7222222222222223| 0.0|\n",
"|2363c2cc-7f94-467...| 0.0| 0.0| 0.0| 12.121212121212121| 1.0| 0.0|\n",
"|2e004da8-945b-402...| 50.0| 27.20797720797721| 50.0| 100.0|1.3333333333333333| 0.0|\n",
"|2f586e36-2df5-43a...|95.83333333333334| 53.32451499118166|2.3529411764705883| 47.31182795698925| 4.0| 2.3529411764705883|\n",
"|2ff6cb1c-fabe-4e2...| 0.0| 43.6026936026936| 2.564102564102564| 68.55123674911661|1.4444444444444444| 0.0|\n",
"|38745a43-31fc-804...| 96.0|12.895768833849331|3.1088082901554404| 64.22764227642277| 1.2| 0.0|\n",
"|3a6c14f4-2f83-4d2...| 0.0|11.527777777777779| 10.0| 62.5| 2.0| 0.0|\n",
"|3ad43720-f5b8-4bf...|97.22222222222221|14.444444444444446|0.5405405405405406| 59.58549222797927| 1.0| 0.2702702702702703|\n",
"|4aa2baf6-7277-433...| 0.0| 0.0| 100.0| 50.0| 2.0| 0.0|\n",
"|55080658-c5b8-404...| 0.0| 2.380952380952381|15.384615384615385| 88.88888888888889| 1.0| 0.0|\n",
"|55c3e4ca-a87f-411...| 0.0| 48.98989898989899| 62.16216216216216| 87.5|2.4615384615384617| 0.0|\n",
"+--------------------+-----------------+------------------+------------------+----------------------+------------------+---------------------+\n",
"only showing top 20 rows\n",
"\n",
"13506\n"
]
}
],
"source": [
"# Merge with recency index\n",
"unengaged_merged = unengaged_users \\\n",
" .join(recency_index, unengaged_users.client_id == recency_index.client_id) \\\n",
" .drop(unengaged_users.client_id)\n",
"\n",
"# Merge with loyalty index\n",
"unengaged_merged = unengaged_merged \\\n",
" .join(loyalty_index, unengaged_merged.client_id == loyalty_index.client_id, \"outer\") \\\n",
" .select(unengaged_merged.client_id.alias(\"client_id\"), \"loyalty\", \"recency_index\") \\\n",
" .na.drop('any', None, [\"client_id\"]) \\\n",
" .na.fill({'loyalty': 0})\n",
"\n",
"# Merge with interaction index\n",
"unengaged_merged = unengaged_merged \\\n",
" .join(interaction_index, unengaged_merged.client_id == interaction_index.client_id, \"outer\") \\\n",
" .select(unengaged_merged.client_id.alias(\"client_id\"), \"loyalty\", \"recency_index\", \"interaction_index\") \\\n",
" .na.drop('any', None, [\"client_id\"]) \\\n",
" .na.fill({'interaction_index': 0})\n",
" \n",
"# Merge with session duration index\n",
"unengaged_merged = unengaged_merged \\\n",
" .join(session_duration_index, unengaged_merged.client_id == session_duration_index.client_id, \"outer\") \\\n",
" .select(unengaged_merged.client_id.alias(\"client_id\"), \"loyalty\", \"recency_index\", \"interaction_index\", \"session_duration_index\") \\\n",
" .na.drop('any', None, [\"client_id\"]) \\\n",
" .na.fill({'session_duration_index': 0})\n",
"\n",
"# Merge with avg interactions\n",
"unengaged_merged = unengaged_merged \\\n",
" .join(avg_interactions, unengaged_merged.client_id == avg_interactions.client_id, \"outer\") \\\n",
" .select(unengaged_merged.client_id.alias(\"client_id\"), \"loyalty\", \"recency_index\", \"interaction_index\", \"session_duration_index\", \"avg_interactions\") \\\n",
" .na.drop('any', None, [\"client_id\"]) \\\n",
" .na.fill({'avg_interactions': 0})\n",
" \n",
" \n",
"# Merge with neg interaction index\n",
"unengaged_merged = unengaged_merged \\\n",
" .join(neg_interaction_index, unengaged_merged.client_id == neg_interaction_index.client_id, \"outer\") \\\n",
" .select(unengaged_merged.client_id.alias(\"client_id\"), \"loyalty\", \"recency_index\", \"interaction_index\", \"session_duration_index\", \"avg_interactions\", \"neg_interaction_index\") \\\n",
" .na.drop('any', None, [\"client_id\"]) \\\n",
" .na.fill({'neg_interaction_index': 0})\n",
" \n",
"unengaged_merged.show()\n",
"print unengaged_merged.count()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Row(avg(loyalty)=50.040966179128525)]\n",
"[Row(avg(recency_index)=33.92463385332862)]\n",
"[Row(avg(interaction_index)=19.640492853593006)]\n",
"[Row(avg(neg_interaction_index)=0.8667846971471196)]\n",
"[Row(avg(session_duration_index)=61.39640348508209)]\n"
]
}
],
"source": [
"print unengaged_merged.agg(avg(col(\"loyalty\"))).take(1)\n",
"print unengaged_merged.agg(avg(col(\"recency_index\"))).take(1)\n",
"print unengaged_merged.agg(avg(col(\"interaction_index\"))).take(1)\n",
"print unengaged_merged.agg(avg(col(\"neg_interaction_index\"))).take(1)\n",
"print unengaged_merged.agg(avg(col(\"session_duration_index\"))).take(1)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"645\n",
"4041\n"
]
}
],
"source": [
"engaged_subset = engaged_merged.randomSplit([1.0, 6.0])\n",
"print engaged_subset[0].count()\n",
"print engaged_subset[1].count()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3868\n",
"9638\n"
]
}
],
"source": [
"unengaged_subset = unengaged_merged.randomSplit([8.0, 20.0])\n",
"print unengaged_subset[0].count()\n",
"print unengaged_subset[1].count()"
]
},
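{
"cell_type": "markdown",
"metadata": {},
"source": [
"`randomSplit` normalizes its weights, so `[1.0, 6.0]` keeps roughly 1/7 and 6/7 of the rows and `[8.0, 20.0]` roughly 2/7 and 5/7; exact counts vary with the random seed. The weights appear chosen so the two training sets built next are roughly balanced, at about 4,000 engaged and 3,900 unengaged rows. A quick illustrative check:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative: randomSplit weights are normalized, so [1.0, 6.0] splits ~1:6.\n",
"parts = sqlContext.range(7000).randomSplit([1.0, 6.0])\n",
"print [p.count() for p in parts]  # roughly [1000, 6000]"
]
},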
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"engaged_data = []\n",
"unengaged_data = []\n",
"\n",
"for row in engaged_subset[1].collect():\n",
" engaged_data.append(LabeledPoint(0, [row.loyalty, row.recency_index, row.interaction_index, row.session_duration_index, row.neg_interaction_index]))\n",
"\n",
"for row in unengaged_subset[0].collect():\n",
" unengaged_data.append(LabeledPoint(1, [row.loyalty, row.recency_index, row.interaction_index, row.session_duration_index, row.neg_interaction_index]))"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"engaged_test_data = []\n",
"unengaged_test_data = []\n",
"\n",
"for row in engaged_subset[0].collect():\n",
" engaged_test_data.append(LabeledPoint(0, [row.loyalty, row.recency_index, row.interaction_index, row.session_duration_index, row.neg_interaction_index]))\n",
"\n",
"for row in unengaged_subset[1].collect():\n",
" unengaged_test_data.append(LabeledPoint(1, [row.loyalty, row.recency_index, row.interaction_index, row.session_duration_index, row.neg_interaction_index]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train an SVM"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data = sc.parallelize(engaged_data + unengaged_data)\n",
"mcm = SVMWithSGD.train(data, iterations=100, regParam=0.001)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Engaged Accuracy: 0.458914728682\n",
"Unengaged Accuracy: 0.865532268105\n",
"Total Accuracy: 0.840027229408\n",
"Class 0.0 precision = 0.185929648241\n",
"Class 0.0 recall = 0.458914728682\n",
"Class 1.0 precision = 0.959843516281\n",
"Class 1.0 recall = 0.865532268105\n"
]
}
],
"source": [
"score_and_labels = []\n",
"\n",
"test_correct_engaged = 0\n",
"for item in engaged_test_data:\n",
" prediction = mcm.predict(item.features)\n",
" if prediction == item.label:\n",
" test_correct_engaged += 1\n",
" score_and_labels.append((float(prediction), item.label))\n",
"\n",
"print \"Engaged Accuracy: \" + str(test_correct_engaged / float(len(engaged_test_data)))\n",
"\n",
"test_correct_unengaged = 0\n",
"for item in unengaged_test_data:\n",
" prediction = mcm.predict(item.features)\n",
" if prediction == item.label:\n",
" test_correct_unengaged += 1\n",
" score_and_labels.append((float(prediction), item.label))\n",
"\n",
"print \"Unengaged Accuracy: \" + str(test_correct_unengaged / float(len(unengaged_test_data)))\n",
"\n",
"print \"Total Accuracy: \" + str((test_correct_engaged + test_correct_unengaged) / float(len(unengaged_test_data) + len(engaged_test_data)))\n",
"\n",
"metrics = MulticlassMetrics(sc.parallelize(score_and_labels))\n",
"\n",
"# Statistics by class\n",
"labels = [float(1), float(0)]\n",
"for label in sorted(labels):\n",
" print(\"Class %s precision = %s\" % (label, metrics.precision(label)))\n",
" print(\"Class %s recall = %s\" % (label, metrics.recall(label)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train Logistic Regression"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"lr = LogisticRegressionWithSGD.train(data)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Engaged Accuracy: 0.996899224806\n",
"Unengaged Accuracy: 0.353807843951\n",
"Total Accuracy: 0.394145677332\n",
"Class 0.0 precision = 0.0935817202736\n",
"Class 0.0 recall = 0.996899224806\n",
"Class 1.0 precision = 0.999413833529\n",
"Class 1.0 recall = 0.353807843951\n"
]
}
],
"source": [
"score_and_labels = []\n",
"\n",
"test_correct_engaged = 0\n",
"for item in engaged_test_data:\n",
" prediction = lr.predict(item.features)\n",
" if prediction == item.label:\n",
" test_correct_engaged += 1\n",
" score_and_labels.append((float(prediction), item.label))\n",
"\n",
"print \"Engaged Accuracy: \" + str(test_correct_engaged / float(len(engaged_test_data)))\n",
"\n",
"test_correct_unengaged = 0\n",
"for item in unengaged_test_data:\n",
" prediction = lr.predict(item.features)\n",
" if prediction == item.label:\n",
" test_correct_unengaged += 1\n",
" score_and_labels.append((float(prediction), item.label))\n",
"\n",
"print \"Unengaged Accuracy: \" + str(test_correct_unengaged / float(len(unengaged_test_data)))\n",
"\n",
"print \"Total Accuracy: \" + str((test_correct_engaged + test_correct_unengaged) / float(len(unengaged_test_data) + len(engaged_test_data)))\n",
"\n",
"metrics = MulticlassMetrics(sc.parallelize(score_and_labels))\n",
"\n",
"# Statistics by class\n",
"labels = [float(1), float(0)]\n",
"for label in sorted(labels):\n",
" print(\"Class %s precision = %s\" % (label, metrics.precision(label)))\n",
" print(\"Class %s recall = %s\" % (label, metrics.recall(label)))"
]
},
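{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train a Decision Tree"
]
},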
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"dt = DecisionTree.trainClassifier(data, 2, {})"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Engaged Accuracy: 0.902325581395\n",
"Unengaged Accuracy: 0.817908279726\n",
"Total Accuracy: 0.823203345327\n",
"Class 0.0 precision = 0.249037227214\n",
"Class 0.0 recall = 0.902325581395\n",
"Class 1.0 precision = 0.992071482507\n",
"Class 1.0 recall = 0.817908279726\n"
]
}
],
"source": [
"score_and_labels = []\n",
"\n",
"test_correct_engaged = 0\n",
"for item in engaged_test_data:\n",
" prediction = dt.predict(item.features)\n",
" if prediction == item.label:\n",
" test_correct_engaged += 1\n",
" score_and_labels.append((float(prediction), item.label))\n",
"\n",
"print \"Engaged Accuracy: \" + str(test_correct_engaged / float(len(engaged_test_data)))\n",
"\n",
"test_correct_unengaged = 0\n",
"for item in unengaged_test_data:\n",
" prediction = dt.predict(item.features)\n",
" if prediction == item.label:\n",
" test_correct_unengaged += 1\n",
" score_and_labels.append((float(prediction), item.label))\n",
"\n",
"print \"Unengaged Accuracy: \" + str(test_correct_unengaged / float(len(unengaged_test_data)))\n",
"\n",
"print \"Total Accuracy: \" + str((test_correct_engaged + test_correct_unengaged) / float(len(unengaged_test_data) + len(engaged_test_data)))\n",
"\n",
"metrics = MulticlassMetrics(sc.parallelize(score_and_labels))\n",
"\n",
"# Statistics by class\n",
"labels = [float(1), float(0)]\n",
"for label in sorted(labels):\n",
" print(\"Class %s precision = %s\" % (label, metrics.precision(label)))\n",
" print(\"Class %s recall = %s\" % (label, metrics.recall(label)))"
]
},
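{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train a Random Forest"
]
},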
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"rf = RandomForest.trainClassifier(data, 2, {}, numTrees=1000)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Engaged Accuracy: 0.886821705426\n",
"Unengaged Accuracy: 0.843328491388\n",
"Total Accuracy: 0.846056598269\n",
"Class 0.0 precision = 0.274735830932\n",
"Class 0.0 recall = 0.886821705426\n",
"Class 1.0 precision = 0.991098646507\n",
"Class 1.0 recall = 0.843328491388\n"
]
}
],
"source": [
"score_and_labels = []\n",
"\n",
"test_correct_engaged = 0\n",
"for item in engaged_test_data:\n",
" prediction = rf.predict(item.features)\n",
" if prediction == item.label:\n",
" test_correct_engaged += 1\n",
" score_and_labels.append((float(prediction), item.label))\n",
"\n",
"print \"Engaged Accuracy: \" + str(test_correct_engaged / float(len(engaged_test_data)))\n",
"\n",
"test_correct_unengaged = 0\n",
"for item in unengaged_test_data:\n",
" prediction = rf.predict(item.features)\n",
" if prediction == item.label:\n",
" test_correct_unengaged += 1\n",
" score_and_labels.append((float(prediction), item.label))\n",
"\n",
"print \"Unengaged Accuracy: \" + str(test_correct_unengaged / float(len(unengaged_test_data)))\n",
"\n",
"print \"Total Accuracy: \" + str((test_correct_engaged + test_correct_unengaged) / float(len(unengaged_test_data) + len(engaged_test_data)))\n",
"\n",
"metrics = MulticlassMetrics(sc.parallelize(score_and_labels))\n",
"\n",
"# Statistics by class\n",
"labels = [float(1), float(0)]\n",
"for label in sorted(labels):\n",
" print(\"Class %s precision = %s\" % (label, metrics.precision(label)))\n",
" print(\"Class %s recall = %s\" % (label, metrics.recall(label)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Let's see if there is a correlation between average number of interactions and the various engagement metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"recency_index\", \"avg_interactions\")\n",
"print unengaged_merged.corr(\"recency_index\", \"avg_interactions\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"loyalty\", \"avg_interactions\")\n",
"print unengaged_merged.corr(\"loyalty\", \"avg_interactions\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"session_duration_index\", \"avg_interactions\")\n",
"print unengaged_merged.corr(\"session_duration_index\", \"avg_interactions\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"interaction_index\", \"avg_interactions\")\n",
"print unengaged_merged.corr(\"interaction_index\", \"avg_interactions\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"interaction_index\", \"loyalty\")\n",
"print unengaged_merged.corr(\"interaction_index\", \"loyalty\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"interaction_index\", \"recency_index\")\n",
"print unengaged_merged.corr(\"interaction_index\", \"recency_index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"interaction_index\", \"session_duration_index\")\n",
"print unengaged_merged.corr(\"interaction_index\", \"session_duration_index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"interaction_index\", \"neg_interaction_index\")\n",
"print unengaged_merged.corr(\"interaction_index\", \"neg_interaction_index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"loyalty\", \"recency_index\")\n",
"print unengaged_merged.corr(\"loyalty\", \"recency_index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"loyalty\", \"session_duration_index\")\n",
"print unengaged_merged.corr(\"loyalty\", \"session_duration_index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"loyalty\", \"neg_interaction_index\")\n",
"print unengaged_merged.corr(\"loyalty\", \"neg_interaction_index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"recency_index\", \"session_duration_index\")\n",
"print unengaged_merged.corr(\"recency_index\", \"session_duration_index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"recency_index\", \"neg_interaction_index\")\n",
"print unengaged_merged.corr(\"recency_index\", \"neg_interaction_index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print engaged_merged.corr(\"session_duration_index\", \"neg_interaction_index\")\n",
"print unengaged_merged.corr(\"session_duration_index\", \"neg_interaction_index\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# coding: utf-8
# # Engagement Features
# In[55]:
import datetime
import numpy as np
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
from pyspark.sql.types import *
from datetime import date, timedelta
from pyspark.mllib.stat import Statistics
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.functions import col, count
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, RandomForest, RandomForestModel
from pyspark.mllib.classification import *
from pyspark.mllib.evaluation import MulticlassMetrics
# ## Import Activity Stream Tables
# In[3]:
activity_stream_events_daily_url = "activity_stream_events_daily.csv"
activity_stream_stats_daily_url = "activity_stream_stats_daily.csv"
# In[4]:
pandas_events = pd.read_csv(activity_stream_events_daily_url, ",")
pandas_stats = pd.read_csv(activity_stream_stats_daily_url, ",")
# In[5]:
integer_types = ["max_scroll_depth", "load_latency", "total_bookmarks", "total_history_size", "session_duration"]
events_fields = [StructField(field_name, IntegerType(), True) if field_name in integer_types else StructField(field_name, StringType(), True) for field_name in pandas_events.columns]
stats_fields = [StructField(field_name, IntegerType(), True) if field_name in integer_types else StructField(field_name, StringType(), True) for field_name in pandas_stats.columns]
events_schema = StructType(events_fields)
stats_schema = StructType(stats_fields)
# In[7]:
activity_stream_events_daily_df = sqlContext.createDataFrame(pandas_events, schema=events_schema)
activity_stream_stats_daily_df = sqlContext.createDataFrame(pandas_stats, schema=stats_schema)
# In[8]:
sqlContext.registerDataFrameAsTable(activity_stream_events_daily_df, "activity_stream_events_daily")
sqlContext.registerDataFrameAsTable(activity_stream_stats_daily_df, "activity_stream_stats_daily")
# ## Identify "Engaged" Users - Returning at Least Once Per Week Over the Past Four Weeks
# In[9]:
# Find how many days ago our earliest datapoint is.
earliest_date = activity_stream_events_daily_df .select("date") .orderBy("date").first()[0]
earliest_datetime = datetime.datetime.strptime(earliest_date, "%Y-%m-%d")
days_ago = (datetime.datetime.today() - earliest_datetime).days
# In[10]:
# Create a dataframe of all the dates between now and days_ago
base = datetime.datetime.today()
date_list = [(base - datetime.timedelta(days=x)).date() for x in range(0, days_ago)]
pandas_df = pd.DataFrame(date_list, columns=["date"])
date_list_df = sqlContext.createDataFrame(pandas_df)
sqlContext.registerDataFrameAsTable(date_list_df, "date_list")
# In[11]:
# Map each client to all possible dates
client_list_df = activity_stream_events_daily_df.select("client_id").distinct()
all_dates_clients_df = date_list_df.join(client_list_df)
sqlContext.registerDataFrameAsTable(all_dates_clients_df, "all_dates_clients")
# In[12]:
# Create a table of user interactions per day
interactions_per_user_per_day_df = (activity_stream_events_daily_df
                                    .select("client_id", "date")
                                    .groupBy("client_id", "date")
                                    .count()
                                    .orderBy("client_id", "date"))
interactions_per_user_per_day_df = interactions_per_user_per_day_df.withColumn("date", col("date").cast("date"))
sqlContext.registerDataFrameAsTable(interactions_per_user_per_day_df, "interactions_per_user_per_day")
# In[13]:
interactions_per_user_per_day_df.show()
# In[14]:
all_users_dates_counts = sqlContext.sql(
"SELECT all_dates_clients.client_id, all_dates_clients.date, COALESCE(interactions_per_user_per_day.count, 0) AS count " +
"FROM all_dates_clients " +
"LEFT JOIN interactions_per_user_per_day " +
"ON all_dates_clients.client_id = interactions_per_user_per_day.client_id " +
"AND all_dates_clients.date = interactions_per_user_per_day.date " +
"ORDER BY client_id, date")
# In[15]:
all_users_dates_counts.cache()
# In[16]:
# Hive timestamps are interpreted as UNIX timestamps in seconds.
days = lambda i: i * 86400
w = (Window()
.partitionBy(col("client_id"))
.orderBy(col("date").cast("timestamp").cast("long"))
.rangeBetween(-days(6), 0))
weekly_avgs = all_users_dates_counts.select(col("client_id"),
col("date").cast("timestamp"),
col("count"),
mean("count").over(w).alias("week_avg"))
# In[17]:
weekly_avgs.show()
# In[18]:
# Users whose trailing 7-day average is positive on every one of the last 21 days,
# i.e. at least one visit in each 7-day window, reaching back roughly four weeks.
engaged_users = (weekly_avgs
                 .filter(col("date") >= date_sub(current_date(), 21))
                 .filter(col("date") <= date_sub(current_date(), 0))
                 .where(col("week_avg") > 0)
                 .select("client_id")
                 .groupBy("client_id")
                 .count()
                 .where(col("count") == 21)
                 .select("client_id").distinct())
engaged_users.count()
# In[19]:
unengaged_users = (activity_stream_stats_daily_df
                   .where(col("date") >= date_sub(current_date(), 21))
                   .select("client_id").distinct()
                   .subtract(engaged_users))
unengaged_users.count()
# In[21]:
dates = []
engaged = []
unengaged = []
for i in xrange(8):
    engaged_users = (weekly_avgs
                     .filter(col("date") >= date_sub(current_date(), 7 * i + 21))
                     .filter(col("date") <= date_sub(current_date(), 7 * i))
                     .where(col("week_avg") > 0)
                     .select("client_id")
                     .groupBy("client_id")
                     .count()
                     .where(col("count") == 21)
                     .select("client_id").distinct())
    unengaged_users = (activity_stream_stats_daily_df
                       .filter(col("date") >= date_sub(current_date(), 7 * i + 21))
                       .filter(col("date") <= date_sub(current_date(), 7 * i))
                       .select("client_id").distinct()
                       .subtract(engaged_users))
    end_of_week = date.today() - timedelta(days=7 * i)
    # Count each DataFrame once instead of re-triggering the Spark job per print.
    n_engaged = engaged_users.count()
    n_unengaged = unengaged_users.count()
    print end_of_week
    print "Engaged Users: " + str(n_engaged)
    print "Unengaged Users: " + str(n_unengaged)
    print "Total Users: " + str(n_engaged + n_unengaged)
    dates.append(end_of_week)
    engaged.append(n_engaged)
    unengaged.append(n_unengaged)
# In[22]:
import plotly.plotly as py
import plotly.graph_objs as go
trace1 = go.Bar(
x=dates,
y=engaged,
name='Engaged Users'
)
trace2 = go.Bar(
x=dates,
y=unengaged,
name='Unengaged Users'
)
data = [trace1, trace2]
layout = go.Layout(
barmode='stack',
xaxis=dict(
title='3-Week Period End Date',
titlefont=dict(
family='Courier New, monospace',
size=18,
color='#7f7f7f'
)
),
yaxis=dict(
title='Number of Users',
titlefont=dict(
family='Courier New, monospace',
size=18,
color='#7f7f7f'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stacked-bar')
# # Aggregating the Various Component Metrics
# ## Loyalty Index
# In[20]:
# Clients with a single active day in the past 7 score 0 here; clients with no
# activity at all in the past 7 days drop out of this frame entirely and are
# assigned a loyalty of 0 later, when the indices are merged with na.fill.
loyalty_index = (activity_stream_stats_daily_df
                 .where(col("date") > date_sub(current_date(), 7))
                 .select("client_id")
                 .groupBy("client_id").count()
                 .select("client_id", ((1 - (1 / col("count"))) * 100).alias("loyalty"))
                 .orderBy(desc("loyalty")))
loyalty_index.show()
loyalty_index.count()
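# A worked instance of the formula above, assuming roughly one stats row per client
# per active day (if the table is per-session, `count` is sessions instead):
for active_days in (1, 2, 7):
    # plain-Python check; the 1.0 avoids Python 2 integer division
    print active_days, (1 - 1.0 / active_days) * 100
# 1 -> 0.0, 2 -> 50.0, 7 -> ~85.7: the index rises steeply and saturates below 100.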
# ## Block and Delete Rate
# In[21]:
# Only includes sessions where session_id exists
events = (activity_stream_events_daily_df
          .where((col("event") == "DELETE") | (col("event") == "BLOCK")))
neg_interaction_rate = (activity_stream_stats_daily_df
    .where(col("session_id") != 'n/a')
    .where(col("session_id") != 'NaN')
    .select("client_id", "session_id")
    .join(events, activity_stream_stats_daily_df.session_id == events.session_id, "outer")
    .select(activity_stream_stats_daily_df.client_id.alias("stats_client_id"),
            activity_stream_stats_daily_df.session_id.alias("stats_session_id"),
            events.client_id.alias("events_client_id"),
            events.session_id.alias("events_session_id")))
neg_interaction_index = (neg_interaction_rate
    .groupBy("stats_client_id")
    .agg((countDistinct(neg_interaction_rate.events_session_id) /
          countDistinct(neg_interaction_rate.stats_session_id) * 100)
         .alias("neg_interaction_index"))
    .na.drop()
    .orderBy(desc("neg_interaction_index"))
    .select(col("stats_client_id").alias("client_id"), "neg_interaction_index"))
neg_interaction_index.show()
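# A worked instance of the ratio (hypothetical numbers): a client whose 20 distinct
# sessions include 3 containing a DELETE or BLOCK scores 3/20 * 100 = 15. The outer
# join leaves events_session_id NULL for sessions with no negative events, and
# countDistinct ignores NULLs, so such sessions only feed the denominator.
print 3 / 20.0 * 100  # 15.0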
# ## Interaction Rate
# In[22]:
# Only includes sessions where session_id exists
positive_events = (activity_stream_events_daily_df
                   .where(col("event") != "DELETE")
                   .where(col("event") != "BLOCK"))
interaction_rate = (activity_stream_stats_daily_df
    .where(col("session_id") != 'n/a')
    .where(col("session_id") != 'NaN')
    .select("client_id", "session_id")
    .join(positive_events, activity_stream_stats_daily_df.session_id == positive_events.session_id, "outer")
    .select(activity_stream_stats_daily_df.client_id.alias("stats_client_id"),
            activity_stream_stats_daily_df.session_id.alias("stats_session_id"),
            positive_events.client_id.alias("events_client_id"),
            positive_events.session_id.alias("events_session_id")))
interaction_index = (interaction_rate
    .groupBy("stats_client_id")
    .agg((countDistinct(interaction_rate.events_session_id) /
          countDistinct(interaction_rate.stats_session_id) * 100)
         .alias("interaction_index"))
    .na.drop()
    .orderBy(desc("interaction_index"))
    .select(col("stats_client_id").alias("client_id"), "interaction_index"))
interaction_index.show()
# ## Recency Index
# In[23]:
wSpec = Window.partitionBy(col("client_id")).orderBy(desc("date"))
recency_index = (activity_stream_events_daily_df
    .select("client_id", "date",
            coalesce(1 / datediff(col("date"), lead("date").over(wSpec)), lit(0.0))
            .alias("recency_index"))
    .groupBy("client_id", "date").max("recency_index")
    .orderBy("client_id", "date")
    .groupBy("client_id").avg("max(recency_index)"))
recency_index = (recency_index
    .select("client_id", (col(recency_index.columns[1]) * 100).alias("recency_index"))
    .orderBy(desc("recency_index")))
recency_index.show()
recency_index.count()
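# A worked instance (hypothetical dates): a client active on days 1, 2 and 12.
#   day 12: gap to previous active day = 10 -> 1/10 = 0.1
#   day 2:  gap to previous active day = 1  -> 1/1  = 1.0
#   day 1:  no previous active day -> coalesce(..., 0.0) = 0.0
# The index averages these per-day values and scales by 100:
print (0.1 + 1.0 + 0.0) / 3 * 100  # ~36.7; tighter visit spacing -> higher score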
# ## Session Duration Index
# In[24]:
# Only includes sessions within the past 30 days
session_counts = (activity_stream_stats_daily_df
    .where(col("date") > date_sub(current_date(), 30))
    .groupBy("client_id").count()
    .select(col("client_id").alias("client_id1"), col("count").alias("count1")))
long_session_counts = (activity_stream_stats_daily_df
    .where(col("date") > date_sub(current_date(), 30))
    .where(col("session_duration") >= 5500)
    .groupBy("client_id").count()
    .select(col("client_id").alias("client_id2"), col("count").alias("count2")))
session_duration_index = (session_counts
    .join(long_session_counts, session_counts.client_id1 == long_session_counts.client_id2)
    .select(session_counts.client_id1,
            (long_session_counts.count2 / session_counts.count1 * 100)
            .alias("session_duration_index"))
    .orderBy(desc("session_duration_index")))
session_duration_index = session_duration_index.select(
    col("client_id1").alias("client_id"), "session_duration_index")
session_duration_index.show()
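# Assuming session_duration is recorded in milliseconds (an assumption; the schema
# above only marks it as an integer), the 5500 cutoff treats sessions of at least
# 5.5 seconds as "long". A worked instance with hypothetical numbers -- 10 long
# sessions out of 40 in the past 30 days:
print 10 / 40.0 * 100  # 25.0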
# ## Average Interaction Count
# In[25]:
# Compute avg interactions for each client_id
avg_interactions = (interactions_per_user_per_day_df
                    .groupBy("client_id")
                    .avg("count")
                    .select(col("client_id"), col("avg(count)").alias("avg_interactions")))
avg_interactions.show()
# ## Generate Engaged User Features Table
# In[26]:
# Merge with recency index
engaged_merged = (engaged_users
    .join(recency_index, engaged_users.client_id == recency_index.client_id)
    .drop(engaged_users.client_id))
# Merge with loyalty index
engaged_merged = (engaged_merged
    .join(loyalty_index, engaged_merged.client_id == loyalty_index.client_id, "outer")
    .select(engaged_merged.client_id.alias("client_id"), "loyalty", "recency_index")
    .na.drop('any', None, ["client_id"])
    .na.fill({'loyalty': 0}))
# Merge with interaction index
engaged_merged = (engaged_merged
    .join(interaction_index, engaged_merged.client_id == interaction_index.client_id, "outer")
    .select(engaged_merged.client_id.alias("client_id"), "loyalty", "recency_index",
            "interaction_index")
    .na.drop('any', None, ["client_id"])
    .na.fill({'interaction_index': 0}))
# Merge with session duration index
engaged_merged = (engaged_merged
    .join(session_duration_index, engaged_merged.client_id == session_duration_index.client_id, "outer")
    .select(engaged_merged.client_id.alias("client_id"), "loyalty", "recency_index",
            "interaction_index", "session_duration_index")
    .na.drop('any', None, ["client_id"])
    .na.fill({'session_duration_index': 0}))
# Merge with avg interactions
engaged_merged = (engaged_merged
    .join(avg_interactions, engaged_merged.client_id == avg_interactions.client_id, "outer")
    .select(engaged_merged.client_id.alias("client_id"), "loyalty", "recency_index",
            "interaction_index", "session_duration_index", "avg_interactions")
    .na.drop('any', None, ["client_id"])
    .na.fill({'avg_interactions': 0}))
# Merge with neg interaction index
engaged_merged = (engaged_merged
    .join(neg_interaction_index, engaged_merged.client_id == neg_interaction_index.client_id, "outer")
    .select(engaged_merged.client_id.alias("client_id"), "loyalty", "recency_index",
            "interaction_index", "session_duration_index", "avg_interactions",
            "neg_interaction_index")
    .na.drop('any', None, ["client_id"])
    .na.fill({'neg_interaction_index': 0}))
engaged_merged.show()
print engaged_merged.count()
# In[71]:
print engaged_merged.agg(avg(col("loyalty"))).take(1)
print engaged_merged.agg(avg(col("recency_index"))).take(1)
print engaged_merged.agg(avg(col("interaction_index"))).take(1)
print engaged_merged.agg(avg(col("neg_interaction_index"))).take(1)
print engaged_merged.agg(avg(col("session_duration_index"))).take(1)
# ## Generate Unengaged User Features Table
# In[27]:
# Merge with recency index
unengaged_merged = (unengaged_users
    .join(recency_index, unengaged_users.client_id == recency_index.client_id)
    .drop(unengaged_users.client_id))
# Merge with loyalty index
unengaged_merged = (unengaged_merged
    .join(loyalty_index, unengaged_merged.client_id == loyalty_index.client_id, "outer")
    .select(unengaged_merged.client_id.alias("client_id"), "loyalty", "recency_index")
    .na.drop('any', None, ["client_id"])
    .na.fill({'loyalty': 0}))
# Merge with interaction index
unengaged_merged = (unengaged_merged
    .join(interaction_index, unengaged_merged.client_id == interaction_index.client_id, "outer")
    .select(unengaged_merged.client_id.alias("client_id"), "loyalty", "recency_index",
            "interaction_index")
    .na.drop('any', None, ["client_id"])
    .na.fill({'interaction_index': 0}))
# Merge with session duration index
unengaged_merged = (unengaged_merged
    .join(session_duration_index, unengaged_merged.client_id == session_duration_index.client_id, "outer")
    .select(unengaged_merged.client_id.alias("client_id"), "loyalty", "recency_index",
            "interaction_index", "session_duration_index")
    .na.drop('any', None, ["client_id"])
    .na.fill({'session_duration_index': 0}))
# Merge with avg interactions
unengaged_merged = (unengaged_merged
    .join(avg_interactions, unengaged_merged.client_id == avg_interactions.client_id, "outer")
    .select(unengaged_merged.client_id.alias("client_id"), "loyalty", "recency_index",
            "interaction_index", "session_duration_index", "avg_interactions")
    .na.drop('any', None, ["client_id"])
    .na.fill({'avg_interactions': 0}))
# Merge with neg interaction index
unengaged_merged = (unengaged_merged
    .join(neg_interaction_index, unengaged_merged.client_id == neg_interaction_index.client_id, "outer")
    .select(unengaged_merged.client_id.alias("client_id"), "loyalty", "recency_index",
            "interaction_index", "session_duration_index", "avg_interactions",
            "neg_interaction_index")
    .na.drop('any', None, ["client_id"])
    .na.fill({'neg_interaction_index': 0}))
unengaged_merged.show()
print unengaged_merged.count()
# In[72]:
print unengaged_merged.agg(avg(col("loyalty"))).take(1)
print unengaged_merged.agg(avg(col("recency_index"))).take(1)
print unengaged_merged.agg(avg(col("interaction_index"))).take(1)
print unengaged_merged.agg(avg(col("neg_interaction_index"))).take(1)
print unengaged_merged.agg(avg(col("session_duration_index"))).take(1)
# In[42]:
engaged_subset = engaged_merged.randomSplit([1.0, 6.0])
print engaged_subset[0].count()
print engaged_subset[1].count()
# In[43]:
unengaged_subset = unengaged_merged.randomSplit([8.0, 20.0])
print unengaged_subset[0].count()
print unengaged_subset[1].count()
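# randomSplit normalizes its weights, so the expected proportions above are:
print 1.0 / 7, 6.0 / 7     # engaged: ~14% held out for test, ~86% for training
print 8.0 / 28, 20.0 / 28  # unengaged: ~29% for training, ~71% held out
# The split is random per run; passing a seed, e.g. randomSplit([1.0, 6.0], seed=42),
# would make it reproducible (seed is a standard randomSplit parameter).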
# In[44]:
engaged_data = []
unengaged_data = []
for row in engaged_subset[1].collect():
engaged_data.append(LabeledPoint(0, [row.loyalty, row.recency_index, row.interaction_index, row.session_duration_index, row.neg_interaction_index]))
for row in unengaged_subset[0].collect():
unengaged_data.append(LabeledPoint(1, [row.loyalty, row.recency_index, row.interaction_index, row.session_duration_index, row.neg_interaction_index]))
# In[45]:
engaged_test_data = []
unengaged_test_data = []
for row in engaged_subset[0].collect():
engaged_test_data.append(LabeledPoint(0, [row.loyalty, row.recency_index, row.interaction_index, row.session_duration_index, row.neg_interaction_index]))
for row in unengaged_subset[1].collect():
unengaged_test_data.append(LabeledPoint(1, [row.loyalty, row.recency_index, row.interaction_index, row.session_duration_index, row.neg_interaction_index]))
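# The collect() loops above pull every row to the driver. An equivalent,
# distributed construction (a sketch; the helper name is hypothetical) keeps the
# data as RDDs and uses the same label/feature layout:
FEATURE_COLS = ["loyalty", "recency_index", "interaction_index",
                "session_duration_index", "neg_interaction_index"]

def to_labeled_point(label):
    return lambda row: LabeledPoint(label, [row[c] for c in FEATURE_COLS])

engaged_train_rdd = engaged_subset[1].rdd.map(to_labeled_point(0))
unengaged_train_rdd = unengaged_subset[0].rdd.map(to_labeled_point(1))
# train_rdd = engaged_train_rdd.union(unengaged_train_rdd)  # drop-in for `data` below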
# ## Train an SVM
# In[46]:
data = sc.parallelize(engaged_data + unengaged_data)
mcm = SVMWithSGD.train(data, iterations=100, regParam=0.001)
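# SGD-based linear models are sensitive to feature scale. The five features here
# are all 0-100 indices, so training works as-is, but standardizing is cheap
# insurance. A sketch using pyspark.mllib.feature.StandardScaler (not part of the
# original run):
from pyspark.mllib.feature import StandardScaler

labels = data.map(lambda lp: lp.label)
features = data.map(lambda lp: lp.features)
scaler = StandardScaler(withMean=True, withStd=True).fit(features)
scaled_data = labels.zip(scaler.transform(features)) \
                    .map(lambda (l, f): LabeledPoint(l, f))
# mcm_scaled = SVMWithSGD.train(scaled_data, iterations=100, regParam=0.001)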
# In[47]:
score_and_labels = []
test_correct_engaged = 0
for item in engaged_test_data:
prediction = mcm.predict(item.features)
if prediction == item.label:
test_correct_engaged += 1
score_and_labels.append((float(prediction), item.label))
print "Engaged Accuracy: " + str(test_correct_engaged / float(len(engaged_test_data)))
test_correct_unengaged = 0
for item in unengaged_test_data:
prediction = mcm.predict(item.features)
if prediction == item.label:
test_correct_unengaged += 1
score_and_labels.append((float(prediction), item.label))
print "Unengaged Accuracy: " + str(test_correct_unengaged / float(len(unengaged_test_data)))
print "Total Accuracy: " + str((test_correct_engaged + test_correct_unengaged) / float(len(unengaged_test_data) + len(engaged_test_data)))
metrics = MulticlassMetrics(sc.parallelize(score_and_labels))
# Statistics by class
labels = [float(1), float(0)]
for label in sorted(labels):
print("Class %s precision = %s" % (label, metrics.precision(label)))
print("Class %s recall = %s" % (label, metrics.recall(label)))
# ## Train Logistic Regression
# In[66]:
lr = LogisticRegressionWithSGD.train(data)
# In[67]:
score_and_labels = []
test_correct_engaged = 0
for item in engaged_test_data:
prediction = lr.predict(item.features)
if prediction == item.label:
test_correct_engaged += 1
score_and_labels.append((float(prediction), item.label))
print "Engaged Accuracy: " + str(test_correct_engaged / float(len(engaged_test_data)))
test_correct_unengaged = 0
for item in unengaged_test_data:
prediction = lr.predict(item.features)
if prediction == item.label:
test_correct_unengaged += 1
score_and_labels.append((float(prediction), item.label))
print "Unengaged Accuracy: " + str(test_correct_unengaged / float(len(unengaged_test_data)))
print "Total Accuracy: " + str((test_correct_engaged + test_correct_unengaged) / float(len(unengaged_test_data) + len(engaged_test_data)))
metrics = MulticlassMetrics(sc.parallelize(score_and_labels))
# Statistics by class
labels = [float(1), float(0)]
for label in sorted(labels):
print("Class %s precision = %s" % (label, metrics.precision(label)))
print("Class %s recall = %s" % (label, metrics.recall(label)))
# ## Train a Decision Tree
# In[50]:
dt = DecisionTree.trainClassifier(data, 2, {})
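# trainClassifier(data, 2, {}) above means numClasses=2 with no categorical
# features (all five inputs treated as continuous); depth and bins stay at the
# MLlib defaults. The same call with the knobs spelled out (values shown are the
# documented defaults):
# dt = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
#                                   impurity='gini', maxDepth=5, maxBins=32)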
# In[51]:
score_and_labels = []
test_correct_engaged = 0
for item in engaged_test_data:
prediction = dt.predict(item.features)
if prediction == item.label:
test_correct_engaged += 1
score_and_labels.append((float(prediction), item.label))
print "Engaged Accuracy: " + str(test_correct_engaged / float(len(engaged_test_data)))
test_correct_unengaged = 0
for item in unengaged_test_data:
prediction = dt.predict(item.features)
if prediction == item.label:
test_correct_unengaged += 1
score_and_labels.append((float(prediction), item.label))
print "Unengaged Accuracy: " + str(test_correct_unengaged / float(len(unengaged_test_data)))
print "Total Accuracy: " + str((test_correct_engaged + test_correct_unengaged) / float(len(unengaged_test_data) + len(engaged_test_data)))
metrics = MulticlassMetrics(sc.parallelize(score_and_labels))
# Statistics by class
labels = [float(1), float(0)]
for label in sorted(labels):
print("Class %s precision = %s" % (label, metrics.precision(label)))
print("Class %s recall = %s" % (label, metrics.recall(label)))
# ## Train a Random Forest
# In[63]:
rf = RandomForest.trainClassifier(data, 2, {}, numTrees=1000)
# In[64]:
score_and_labels = []
test_correct_engaged = 0
for item in engaged_test_data:
prediction = rf.predict(item.features)
if prediction == item.label:
test_correct_engaged += 1
score_and_labels.append((float(prediction), item.label))
print "Engaged Accuracy: " + str(test_correct_engaged / float(len(engaged_test_data)))
test_correct_unengaged = 0
for item in unengaged_test_data:
prediction = rf.predict(item.features)
if prediction == item.label:
test_correct_unengaged += 1
score_and_labels.append((float(prediction), item.label))
print "Unengaged Accuracy: " + str(test_correct_unengaged / float(len(unengaged_test_data)))
print "Total Accuracy: " + str((test_correct_engaged + test_correct_unengaged) / float(len(unengaged_test_data) + len(engaged_test_data)))
metrics = MulticlassMetrics(sc.parallelize(score_and_labels))
# Statistics by class
labels = [float(1), float(0)]
for label in sorted(labels):
print("Class %s precision = %s" % (label, metrics.precision(label)))
print("Class %s recall = %s" % (label, metrics.recall(label)))
# ## Correlations Between Average Interaction Count and the Engagement Metrics (and Among the Metrics Themselves)
# In[ ]:
print engaged_merged.corr("recency_index", "avg_interactions")
print unengaged_merged.corr("recency_index", "avg_interactions")
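# The cells below compute one Pearson coefficient per pair. Statistics (imported at
# the top) can produce the whole matrix in one pass -- a sketch over the engaged
# frame (the column order here is an assumption, matching the feature table above):
corr_cols = ["loyalty", "recency_index", "interaction_index",
             "session_duration_index", "avg_interactions", "neg_interaction_index"]
vectors = engaged_merged.select(*corr_cols).rdd \
    .map(lambda row: [float(row[c]) for c in corr_cols])
print Statistics.corr(vectors, method="pearson")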
# In[ ]:
print engaged_merged.corr("loyalty", "avg_interactions")
print unengaged_merged.corr("loyalty", "avg_interactions")
# In[ ]:
print engaged_merged.corr("session_duration_index", "avg_interactions")
print unengaged_merged.corr("session_duration_index", "avg_interactions")
# In[ ]:
print engaged_merged.corr("interaction_index", "avg_interactions")
print unengaged_merged.corr("interaction_index", "avg_interactions")
# In[ ]:
print engaged_merged.corr("interaction_index", "loyalty")
print unengaged_merged.corr("interaction_index", "loyalty")
# In[ ]:
print engaged_merged.corr("interaction_index", "recency_index")
print unengaged_merged.corr("interaction_index", "recency_index")
# In[ ]:
print engaged_merged.corr("interaction_index", "session_duration_index")
print unengaged_merged.corr("interaction_index", "session_duration_index")
# In[ ]:
print engaged_merged.corr("interaction_index", "neg_interaction_index")
print unengaged_merged.corr("interaction_index", "neg_interaction_index")
# In[ ]:
print engaged_merged.corr("loyalty", "recency_index")
print unengaged_merged.corr("loyalty", "recency_index")
# In[ ]:
print engaged_merged.corr("loyalty", "session_duration_index")
print unengaged_merged.corr("loyalty", "session_duration_index")
# In[ ]:
print engaged_merged.corr("loyalty", "neg_interaction_index")
print unengaged_merged.corr("loyalty", "neg_interaction_index")
# In[ ]:
print engaged_merged.corr("recency_index", "session_duration_index")
print unengaged_merged.corr("recency_index", "session_duration_index")
# In[ ]:
print engaged_merged.corr("recency_index", "neg_interaction_index")
print unengaged_merged.corr("recency_index", "neg_interaction_index")
# In[ ]:
print engaged_merged.corr("session_duration_index", "neg_interaction_index")
print unengaged_merged.corr("session_duration_index", "neg_interaction_index")