DanielaLaura/gist3.ipynb

## gist3.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "def label_customer(customer_id, logins, prediction_date, churn_days, return_trans=False,\n",
    "                   lead_time=21, prediction_window=35):\n",
    "    \"\"\"\n",
    "    Make label times for a single customer. Returns a dataframe with the cutoff time, the binary\n",
    "    label, the number of days until the next churn and the churn date.\n",
    "\n",
    "    Params\n",
    "    --------\n",
    "        customer_id (str): unique id for the customer\n",
    "        logins (dataframe): logs for this customer only. The columns 'actor_account_id', 'max_time',\n",
    "                            'cutof' and 'time' are read. The input is copied, never modified.\n",
    "        prediction_date (str): either 'MS' or 'SMS'. Only validated here; the cutoff time itself is\n",
    "                               currently fixed to a single date (see the NOTE in the body).\n",
    "        churn_days (int): a row counts as a churn when the whole days between 'max_time' and 'cutof'\n",
    "                          exceed this number. The churn date is 'max_time' plus churn_days days.\n",
    "        return_trans (boolean): whether to also return the annotated copy of the logs. Defaults to False.\n",
    "        lead_time (int): days between the cutoff time and the start of the prediction window.\n",
    "                         Defaults to 21 (3 weeks).\n",
    "        prediction_window (int): length of the prediction window in days. Defaults to 35 (5 weeks).\n",
    "\n",
    "    Return\n",
    "    --------\n",
    "        label_times (dataframe): columns actor_account_id, cutoff_time, label, days_to_churn, churn_date.\n",
    "                                 label is 1 when a churn date falls inside the prediction window,\n",
    "                                 0 when no row churned or a log 'time' falls inside the window,\n",
    "                                 and NaN otherwise.\n",
    "        If return_trans is True a tuple (label_times, logs) is returned instead. In the churn branch\n",
    "        that label_times drops actor_account_id and keeps the prediction window columns.\n",
    "\n",
    "    Raises\n",
    "    --------\n",
    "        AssertionError: if prediction_date is not 'MS'/'SMS' or logins does not hold exactly the\n",
    "                        rows of customer_id.\n",
    "    \"\"\"\n",
    "\n",
    "    assert prediction_date in ['MS', 'SMS'], \"Prediction day must be either 'MS' or 'SMS'\"\n",
    "    # Compare explicitly: asserting on `unique() == [customer_id]` evaluates an array, which raises\n",
    "    # ValueError (ambiguous truth value) instead of AssertionError when several ids are present.\n",
    "    unique_ids = logins['actor_account_id'].unique()\n",
    "    assert len(unique_ids) == 1 and unique_ids[0] == customer_id, \"Transactions must be for only customer\"\n",
    "\n",
    "    # Don't modify original\n",
    "    logs = logins.copy()\n",
    "\n",
    "    # Time between the last log and the cutoff\n",
    "    logs['difference_days'] = logs['cutof'] - logs['max_time']\n",
    "    # Whole days via .dt.days; astype('timedelta64[D]') is rejected by pandas >= 2.0\n",
    "    logs['churn'] = logs['difference_days'].dt.days > churn_days\n",
    "    logs['last_day_active'] = logs['max_time']\n",
    "\n",
    "    # Date of each churn; rows without a churn get NaT\n",
    "    logs['churn_date'] = (logs['last_day_active'] + pd.Timedelta(churn_days, 'd')).where(logs['churn'])\n",
    "\n",
    "    # NOTE(review): the cutoff time is hard-coded and prediction_date is not used to build a\n",
    "    # range of cutoff times, so label_times always holds exactly one row.\n",
    "    label_times = pd.DataFrame({'actor_account_id': [customer_id]})\n",
    "    label_times.insert(1, 'cutoff_time', '05-11-2016')\n",
    "    label_times['cutoff_time'] = label_times['cutoff_time'].astype('datetime64[ns]')\n",
    "\n",
    "    # The prediction window starts lead_time days after each cutoff and lasts prediction_window days\n",
    "    label_times['prediction_window_start'] = label_times['cutoff_time'] + pd.Timedelta(lead_time, 'd')\n",
    "    label_times['prediction_window_end'] = (label_times['prediction_window_start']\n",
    "                                            + pd.Timedelta(prediction_window, 'd'))\n",
    "\n",
    "    label_columns = ['actor_account_id', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']\n",
    "\n",
    "    # When no row churned every cutoff is a negative label\n",
    "    if not logs['churn'].any():\n",
    "        label_times['label'] = 0\n",
    "        label_times['days_to_churn'] = np.nan\n",
    "        # NaT (not NaN) so churn_date has the same datetime dtype as in the churn branch\n",
    "        label_times['churn_date'] = pd.NaT\n",
    "        if return_trans:\n",
    "            return label_times[label_columns], logs\n",
    "        return label_times[label_columns]\n",
    "\n",
    "    previous_churn_date = None\n",
    "\n",
    "    # Iterate through every cutoff time\n",
    "    for i, row in label_times.iterrows():\n",
    "\n",
    "        # Default values if unknown\n",
    "        churn_date = pd.NaT\n",
    "        label = np.nan\n",
    "\n",
    "        window_start = row['prediction_window_start']\n",
    "        window_end = row['prediction_window_end']\n",
    "\n",
    "        # Churn dates inside [window_start, window_end)\n",
    "        in_window = (logs['churn_date'] >= window_start) & (logs['churn_date'] < window_end)\n",
    "        churns = logs.loc[in_window, 'churn_date']\n",
    "\n",
    "        if not churns.empty:\n",
    "            # Positive label if there was a churn during the window\n",
    "            label = 1\n",
    "            churn_date = churns.iloc[0]\n",
    "\n",
    "            # Cutoff times before the current churn and after the previous churn\n",
    "            before = label_times['cutoff_time'] <= churn_date\n",
    "            if previous_churn_date is not None:\n",
    "                before = before & (label_times['cutoff_time'] > previous_churn_date)\n",
    "            before_idx = label_times.index[before]\n",
    "\n",
    "            # Days to the next churn for those cutoff times\n",
    "            time_to_churn = churn_date - label_times.loc[before_idx, 'cutoff_time']\n",
    "            label_times.loc[before_idx, 'days_to_churn'] = time_to_churn.dt.total_seconds() / (3600 * 24)\n",
    "            previous_churn_date = churn_date\n",
    "        else:\n",
    "            # No churn in the window: negative label if any log 'time' falls inside the window\n",
    "            logs_before = logs.loc[logs['time'] < window_end]\n",
    "            if (logs_before['time'] >= window_start).any():\n",
    "                label = 0\n",
    "\n",
    "        # Assign values\n",
    "        label_times.loc[i, 'label'] = label\n",
    "        label_times.loc[i, 'churn_date'] = churn_date\n",
    "\n",
    "    # Handle case with no positive label (done once, after all cutoff times are processed)\n",
    "    if not (label_times['label'] == 1).any():\n",
    "        label_times['days_to_churn'] = np.nan\n",
    "        label_times['churn_date'] = pd.NaT\n",
    "\n",
    "    if return_trans:\n",
    "        return label_times.drop(columns=['actor_account_id']), logs\n",
    "\n",
    "    return label_times[label_columns].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>actor_account_id</th>\n",
       "      <th>cutoff_time</th>\n",
       "      <th>label</th>\n",
       "      <th>days_to_churn</th>\n",
       "      <th>churn_date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>003B1E06</td>\n",
       "      <td>2016-05-11</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  actor_account_id cutoff_time  label  days_to_churn  churn_date\n",
       "0         003B1E06  2016-05-11      0            NaN         NaN"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Set the customer explicitly (the id shown in the recorded output) so this cell\n",
    "# does not depend on a variable left over in the kernel.\n",
    "customer_id = '003B1E06'\n",
    "customer_logs = train.loc[train['actor_account_id'] == customer_id].copy()\n",
    "\n",
    "# 'MS' cutoff frequency, churn after 35 days of inactivity\n",
    "label_times = label_customer(customer_id, customer_logs, 'MS', 35, return_trans=False)\n",
    "label_times"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {},
	"outputs": [],
	"source": [
	"def label_customer(customer_id, logins, prediction_date, churn_days, return_trans=False,\n",
	"                   lead_time=21, prediction_window=35):\n",
	"    \"\"\"\n",
	"    Make label times for a single customer. Returns a dataframe with the cutoff time, the binary\n",
	"    label, the number of days until the next churn and the churn date.\n",
	"\n",
	"    Params\n",
	"    --------\n",
	"        customer_id (str): unique id for the customer\n",
	"        logins (dataframe): logs for this customer only. The columns 'actor_account_id', 'max_time',\n",
	"                            'cutof' and 'time' are read. The input is copied, never modified.\n",
	"        prediction_date (str): either 'MS' or 'SMS'. Only validated here; the cutoff time itself is\n",
	"                               currently fixed to a single date (see the NOTE in the body).\n",
	"        churn_days (int): a row counts as a churn when the whole days between 'max_time' and 'cutof'\n",
	"                          exceed this number. The churn date is 'max_time' plus churn_days days.\n",
	"        return_trans (boolean): whether to also return the annotated copy of the logs. Defaults to False.\n",
	"        lead_time (int): days between the cutoff time and the start of the prediction window.\n",
	"                         Defaults to 21 (3 weeks).\n",
	"        prediction_window (int): length of the prediction window in days. Defaults to 35 (5 weeks).\n",
	"\n",
	"    Return\n",
	"    --------\n",
	"        label_times (dataframe): columns actor_account_id, cutoff_time, label, days_to_churn, churn_date.\n",
	"                                 label is 1 when a churn date falls inside the prediction window,\n",
	"                                 0 when no row churned or a log 'time' falls inside the window,\n",
	"                                 and NaN otherwise.\n",
	"        If return_trans is True a tuple (label_times, logs) is returned instead. In the churn branch\n",
	"        that label_times drops actor_account_id and keeps the prediction window columns.\n",
	"\n",
	"    Raises\n",
	"    --------\n",
	"        AssertionError: if prediction_date is not 'MS'/'SMS' or logins does not hold exactly the\n",
	"                        rows of customer_id.\n",
	"    \"\"\"\n",
	"\n",
	"    assert prediction_date in ['MS', 'SMS'], \"Prediction day must be either 'MS' or 'SMS'\"\n",
	"    # Compare explicitly: asserting on `unique() == [customer_id]` evaluates an array, which raises\n",
	"    # ValueError (ambiguous truth value) instead of AssertionError when several ids are present.\n",
	"    unique_ids = logins['actor_account_id'].unique()\n",
	"    assert len(unique_ids) == 1 and unique_ids[0] == customer_id, \"Transactions must be for only customer\"\n",
	"\n",
	"    # Don't modify original\n",
	"    logs = logins.copy()\n",
	"\n",
	"    # Time between the last log and the cutoff\n",
	"    logs['difference_days'] = logs['cutof'] - logs['max_time']\n",
	"    # Whole days via .dt.days; astype('timedelta64[D]') is rejected by pandas >= 2.0\n",
	"    logs['churn'] = logs['difference_days'].dt.days > churn_days\n",
	"    logs['last_day_active'] = logs['max_time']\n",
	"\n",
	"    # Date of each churn; rows without a churn get NaT\n",
	"    logs['churn_date'] = (logs['last_day_active'] + pd.Timedelta(churn_days, 'd')).where(logs['churn'])\n",
	"\n",
	"    # NOTE(review): the cutoff time is hard-coded and prediction_date is not used to build a\n",
	"    # range of cutoff times, so label_times always holds exactly one row.\n",
	"    label_times = pd.DataFrame({'actor_account_id': [customer_id]})\n",
	"    label_times.insert(1, 'cutoff_time', '05-11-2016')\n",
	"    label_times['cutoff_time'] = label_times['cutoff_time'].astype('datetime64[ns]')\n",
	"\n",
	"    # The prediction window starts lead_time days after each cutoff and lasts prediction_window days\n",
	"    label_times['prediction_window_start'] = label_times['cutoff_time'] + pd.Timedelta(lead_time, 'd')\n",
	"    label_times['prediction_window_end'] = (label_times['prediction_window_start']\n",
	"                                            + pd.Timedelta(prediction_window, 'd'))\n",
	"\n",
	"    label_columns = ['actor_account_id', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']\n",
	"\n",
	"    # When no row churned every cutoff is a negative label\n",
	"    if not logs['churn'].any():\n",
	"        label_times['label'] = 0\n",
	"        label_times['days_to_churn'] = np.nan\n",
	"        # NaT (not NaN) so churn_date has the same datetime dtype as in the churn branch\n",
	"        label_times['churn_date'] = pd.NaT\n",
	"        if return_trans:\n",
	"            return label_times[label_columns], logs\n",
	"        return label_times[label_columns]\n",
	"\n",
	"    previous_churn_date = None\n",
	"\n",
	"    # Iterate through every cutoff time\n",
	"    for i, row in label_times.iterrows():\n",
	"\n",
	"        # Default values if unknown\n",
	"        churn_date = pd.NaT\n",
	"        label = np.nan\n",
	"\n",
	"        window_start = row['prediction_window_start']\n",
	"        window_end = row['prediction_window_end']\n",
	"\n",
	"        # Churn dates inside [window_start, window_end)\n",
	"        in_window = (logs['churn_date'] >= window_start) & (logs['churn_date'] < window_end)\n",
	"        churns = logs.loc[in_window, 'churn_date']\n",
	"\n",
	"        if not churns.empty:\n",
	"            # Positive label if there was a churn during the window\n",
	"            label = 1\n",
	"            churn_date = churns.iloc[0]\n",
	"\n",
	"            # Cutoff times before the current churn and after the previous churn\n",
	"            before = label_times['cutoff_time'] <= churn_date\n",
	"            if previous_churn_date is not None:\n",
	"                before = before & (label_times['cutoff_time'] > previous_churn_date)\n",
	"            before_idx = label_times.index[before]\n",
	"\n",
	"            # Days to the next churn for those cutoff times\n",
	"            time_to_churn = churn_date - label_times.loc[before_idx, 'cutoff_time']\n",
	"            label_times.loc[before_idx, 'days_to_churn'] = time_to_churn.dt.total_seconds() / (3600 * 24)\n",
	"            previous_churn_date = churn_date\n",
	"        else:\n",
	"            # No churn in the window: negative label if any log 'time' falls inside the window\n",
	"            logs_before = logs.loc[logs['time'] < window_end]\n",
	"            if (logs_before['time'] >= window_start).any():\n",
	"                label = 0\n",
	"\n",
	"        # Assign values\n",
	"        label_times.loc[i, 'label'] = label\n",
	"        label_times.loc[i, 'churn_date'] = churn_date\n",
	"\n",
	"    # Handle case with no positive label (done once, after all cutoff times are processed)\n",
	"    if not (label_times['label'] == 1).any():\n",
	"        label_times['days_to_churn'] = np.nan\n",
	"        label_times['churn_date'] = pd.NaT\n",
	"\n",
	"    if return_trans:\n",
	"        return label_times.drop(columns=['actor_account_id']), logs\n",
	"\n",
	"    return label_times[label_columns].copy()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>actor_account_id</th>\n",
	" <th>cutoff_time</th>\n",
	" <th>label</th>\n",
	" <th>days_to_churn</th>\n",
	" <th>churn_date</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>003B1E06</td>\n",
	" <td>2016-05-11</td>\n",
	" <td>0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" actor_account_id cutoff_time label days_to_churn churn_date\n",
	"0 003B1E06 2016-05-11 0 NaN NaN"
	]
	},
	"execution_count": 41,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Set the customer explicitly (the id shown in the recorded output) so this cell\n",
	"# does not depend on a variable left over in the kernel.\n",
	"customer_id = '003B1E06'\n",
	"customer_logs = train.loc[train['actor_account_id'] == customer_id].copy()\n",
	"\n",
	"# 'MS' cutoff frequency, churn after 35 days of inactivity\n",
	"label_times = label_customer(customer_id, customer_logs, 'MS', 35, return_trans=False)\n",
	"label_times"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	},
	"toc": {
	"base_numbering": 1,
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	},
	"varInspector": {
	"cols": {
	"lenName": 16,
	"lenType": 16,
	"lenVar": 40
	},
	"kernels_config": {
	"python": {
	"delete_cmd_postfix": "",
	"delete_cmd_prefix": "del ",
	"library": "var_list.py",
	"varRefreshCmd": "print(var_dic_list())"
	},
	"r": {
	"delete_cmd_postfix": ") ",
	"delete_cmd_prefix": "rm(",
	"library": "var_list.r",
	"varRefreshCmd": "cat(var_dic_list()) "
	}
	},
	"types_to_exclude": [
	"module",
	"function",
	"builtin_function_or_method",
	"instance",
	"_Feature"
	],
	"window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}