Skip to content

Instantly share code, notes, and snippets.

@DanielaLaura
Created June 18, 2019 13:51
Show Gist options
  • Save DanielaLaura/c10c462195f2ff812bb8ff99dd31fac4 to your computer and use it in GitHub Desktop.
Save DanielaLaura/c10c462195f2ff812bb8ff99dd31fac4 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def label_customer(customer_id,logins, prediction_date, churn_days, return_trans = False):\n",
" \"\"\"\n",
" Make label times for a single customer. Returns a dataframe of labels with times, the binary label, \n",
" and the number of days until the next churn.\n",
" \n",
" Params\n",
" --------\n",
" customer_id (str): unique id for the customer\n",
" customer_logs (dataframe): logs dataframe for the customer\n",
" prediction_date (str): time at which predictions are made. Either \"MS\" for the first of the month\n",
" or \"SMS\" for the first and fifteenth of each month \n",
" churn_days (int): integer number of days without an active membership required for a churn. A churn is\n",
" defined by exceeding this number of 5 weeks without being an active player.\n",
" lead_time (int): number of periods in advance to make predictions for. 3 weeks\n",
" prediction_window(int): number of periods over which to consider churn. 5 weeks\n",
" return_trans (boolean): whether or not to return the transactions for analysis. Defaults to False.\n",
" \n",
" Return\n",
" --------\n",
" label_times (dataframe): a table of customer id, the cutoff times at the specified frequency, the \n",
" label for each cutoff time, the number of days until the next churn for each\n",
" cutoff time, and the date on which the churn itself occurred.\n",
" \n",
" \n",
" \"\"\"\n",
" \n",
" assert(prediction_date in ['MS', 'SMS']), \"Prediction day must be either 'MS' or 'SMS'\"\n",
" assert(logins['actor_account_id'].unique() == [customer_id]), \"Transactions must be for only customer\"\n",
" \n",
" # Don't modify original\n",
" logs = logins.copy()\n",
" \n",
" #Range for cutoff times is from first to last log\n",
" first_log = logs['min_time']\n",
" last_log = logs['max_time']\n",
" start_date = first_log\n",
" # pd.datetime(first_log.month, first_log.day)\n",
" \n",
" # Find number of days between last log and cutoff\n",
" \n",
" logs['difference_days'] = logs['cutof']- logs['max_time']\n",
" # Determine which actor are associated with a churn\n",
" logs['churn'] = logs['difference_days'].astype('timedelta64[D]') > churn_days\n",
" logs['last_day_active']=last_log\n",
" \n",
" # Find date of each churn\n",
" logs.loc[logs['churn'] == True, \n",
" 'churn_date'] = logs.loc[logs['churn'] == True, \n",
" 'last_day_active'] + pd.Timedelta(churn_days, 'd')\n",
" \n",
" #time labels\n",
" label_times=pd.DataFrame({'actor_account_id':[customer_id]})\n",
" label_times.insert(1, \"cutoff_time\", \"05-11-2016\")\n",
" label_times['cutoff_time'] = label_times['cutoff_time'].astype('datetime64[ns]')\n",
"\n",
"\n",
" lead_time=21\n",
" prediction_window=35\n",
" # Use the lead time and prediction window parameters to establish the prediction window \n",
" # Prediction window is for each cutoff time\n",
" label_times['prediction_window_start'] = label_times['cutoff_time'] + pd.Timedelta(lead_time, 'd')\n",
" label_times['prediction_window_end'] = label_times['cutoff_time'] + pd.Timedelta(lead_time, 'd') + pd.Timedelta(prediction_window, 'd')\n",
" \n",
" previous_churn_date = None\n",
" \n",
" #when no churn\n",
" if (logs['churn']==False).all():\n",
" label_times['label']=0\n",
" label_times['days_to_churn']=np.nan\n",
" label_times['churn_date']=np.nan\n",
" if return_trans:\n",
" return label_times[['actor_account_id', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']], logs\n",
" return label_times[['actor_account_id', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']]\n",
"\n",
" # Iterate through every cutoff time\n",
" for i, row in label_times.iterrows():\n",
" \n",
" # Default values if unknown\n",
" churn_date = pd.NaT\n",
" label = np.nan\n",
" # Find the window start and end\n",
" window_start = row['prediction_window_start']\n",
" window_end = row['prediction_window_end']\n",
" # Determine if there were any churns during the prediction window\n",
" churns = logs.loc[(logs['churn_date'] >= window_start) & \n",
" (logs['churn_date'] < window_end), 'churn_date']\n",
"\n",
" # Positive label if there was a churn during window\n",
" if not churns.empty:\n",
" label = 1\n",
" churn_date = churns.values[0]\n",
"\n",
" # Find number of days until next churn by \n",
" # subsetting to cutoff times before current churn and after previous churns\n",
" if not previous_churn_date:\n",
" before_idx = label_times.loc[(label_times['cutoff_time'] <= churn_date)].index\n",
" else:\n",
" before_idx = label_times.loc[(label_times['cutoff_time'] <= churn_date) & \n",
" (label_times['cutoff_time'] > previous_churn_date)].index\n",
"\n",
" # Calculate days to next churn for cutoff times before current churn\n",
" label_times.loc[before_idx, 'days_to_churn'] = (churn_date - label_times.loc[before_idx, \n",
" 'cutoff_time']).\\\n",
" dt.total_seconds() / (3600 * 24)\n",
" previous_churn_date = churn_date\n",
" # No churns, but need to determine if an active member\n",
" else:\n",
" # Find transactions before the end of the window that were not cancelled ### here the carracter deleted\n",
" logs_before = logs.loc[(logs['time'] < window_end)].copy()\n",
" # If the membership expiration date for this membership is after the window start, the custom has not churned\n",
" if np.any(logs_before['time'] >= window_start):\n",
" label = 0\n",
"\n",
" # Assign values\n",
" label_times.loc[i, 'label'] = label\n",
" label_times.loc[i, 'churn_date'] = churn_date\n",
" \n",
" # Handle case with no churns\n",
" if not np.any(label_times['label'] == 1):\n",
" label_times['days_to_churn'] = np.nan\n",
" label_times['churn_date'] = pd.NaT\n",
" \n",
" if return_trans:\n",
" return label_times.drop(columns = ['actor_account_id']), logs\n",
" \n",
" return label_times[['actor_account_id', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']].copy()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>actor_account_id</th>\n",
" <th>cutoff_time</th>\n",
" <th>label</th>\n",
" <th>days_to_churn</th>\n",
" <th>churn_date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>003B1E06</td>\n",
" <td>2016-05-11</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" actor_account_id cutoff_time label days_to_churn churn_date\n",
"0 003B1E06 2016-05-11 0 NaN NaN"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#customer_id = train.iloc[0, 0]\n",
"customer_logs = train.loc[train['actor_account_id'] == customer_id].copy()\n",
"\n",
"\n",
"label_times = label_customer(customer_id, customer_logs, 'MS', 35, False)\n",
"label_times"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment