Created
June 18, 2019 13:51
-
-
Save DanielaLaura/c10c462195f2ff812bb8ff99dd31fac4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def label_customer(customer_id,logins, prediction_date, churn_days, return_trans = False):\n", | |
" \"\"\"\n", | |
" Make label times for a single customer. Returns a dataframe of labels with times, the binary label, \n", | |
" and the number of days until the next churn.\n", | |
" \n", | |
" Params\n", | |
" --------\n", | |
" customer_id (str): unique id for the customer\n", | |
" logins (dataframe): logs dataframe for the customer\n", | |
" prediction_date (str): time at which predictions are made. Either \"MS\" for the first of the month\n", | |
" or \"SMS\" for the first and fifteenth of each month \n", | |
" churn_days (int): integer number of days without an active membership required for a churn. A churn is\n", | |
" defined by exceeding this number of days (e.g. 35 days, i.e. 5 weeks) without being an active player.\n", | |
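" lead_time (int): number of days in advance to make predictions for. Not a parameter: fixed at 21 days (3 weeks) in the function body.\n", | |
" prediction_window(int): number of days over which to consider churn. Not a parameter: fixed at 35 days (5 weeks) in the function body.\n", | |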
" return_trans (boolean): whether or not to return the transactions for analysis. Defaults to False.\n", | |
" \n", | |
" Return\n", | |
" --------\n", | |
" label_times (dataframe): a table of customer id, the cutoff times at the specified frequency, the \n", | |
" label for each cutoff time, the number of days until the next churn for each\n", | |
" cutoff time, and the date on which the churn itself occurred.\n", | |
" \n", | |
" \n", | |
" \"\"\"\n", | |
" \n", | |
" assert(prediction_date in ['MS', 'SMS']), \"Prediction day must be either 'MS' or 'SMS'\"\n", | |
" assert(logins['actor_account_id'].unique() == [customer_id]), \"Transactions must be for only customer\"\n", | |
" \n", | |
" # Don't modify original\n", | |
" logs = logins.copy()\n", | |
" \n", | |
" #Range for cutoff times is from first to last log\n", | |
" first_log = logs['min_time']\n", | |
" last_log = logs['max_time']\n", | |
" start_date = first_log\n", | |
" # pd.datetime(first_log.month, first_log.day)\n", | |
" \n", | |
" # Find number of days between last log and cutoff\n", | |
" \n", | |
" logs['difference_days'] = logs['cutof']- logs['max_time']\n", | |
" # Determine which log rows of the actor are associated with a churn\n", | |
" logs['churn'] = logs['difference_days'].astype('timedelta64[D]') > churn_days\n", | |
" logs['last_day_active']=last_log\n", | |
" \n", | |
" # Find date of each churn\n", | |
" logs.loc[logs['churn'] == True, \n", | |
" 'churn_date'] = logs.loc[logs['churn'] == True, \n", | |
" 'last_day_active'] + pd.Timedelta(churn_days, 'd')\n", | |
" \n", | |
" #time labels\n", | |
" label_times=pd.DataFrame({'actor_account_id':[customer_id]})\n", | |
" label_times.insert(1, \"cutoff_time\", \"05-11-2016\")\n", | |
" label_times['cutoff_time'] = label_times['cutoff_time'].astype('datetime64[ns]')\n", | |
"\n", | |
"\n", | |
" lead_time=21\n", | |
" prediction_window=35\n", | |
" # Use the lead time and prediction window parameters to establish the prediction window \n", | |
" # Prediction window is for each cutoff time\n", | |
" label_times['prediction_window_start'] = label_times['cutoff_time'] + pd.Timedelta(lead_time, 'd')\n", | |
" label_times['prediction_window_end'] = label_times['cutoff_time'] + pd.Timedelta(lead_time, 'd') + pd.Timedelta(prediction_window, 'd')\n", | |
" \n", | |
" previous_churn_date = None\n", | |
" \n", | |
" #when no churn\n", | |
" if (logs['churn']==False).all():\n", | |
" label_times['label']=0\n", | |
" label_times['days_to_churn']=np.nan\n", | |
" label_times['churn_date']=np.nan\n", | |
" if return_trans:\n", | |
" return label_times[['actor_account_id', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']], logs\n", | |
" return label_times[['actor_account_id', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']]\n", | |
"\n", | |
" # Iterate through every cutoff time\n", | |
" for i, row in label_times.iterrows():\n", | |
" \n", | |
" # Default values if unknown\n", | |
" churn_date = pd.NaT\n", | |
" label = np.nan\n", | |
" # Find the window start and end\n", | |
" window_start = row['prediction_window_start']\n", | |
" window_end = row['prediction_window_end']\n", | |
" # Determine if there were any churns during the prediction window\n", | |
" churns = logs.loc[(logs['churn_date'] >= window_start) & \n", | |
" (logs['churn_date'] < window_end), 'churn_date']\n", | |
"\n", | |
" # Positive label if there was a churn during window\n", | |
" if not churns.empty:\n", | |
" label = 1\n", | |
" churn_date = churns.values[0]\n", | |
"\n", | |
" # Find number of days until next churn by \n", | |
" # subsetting to cutoff times before current churn and after previous churns\n", | |
" if not previous_churn_date:\n", | |
" before_idx = label_times.loc[(label_times['cutoff_time'] <= churn_date)].index\n", | |
" else:\n", | |
" before_idx = label_times.loc[(label_times['cutoff_time'] <= churn_date) & \n", | |
" (label_times['cutoff_time'] > previous_churn_date)].index\n", | |
"\n", | |
" # Calculate days to next churn for cutoff times before current churn\n", | |
" label_times.loc[before_idx, 'days_to_churn'] = (churn_date - label_times.loc[before_idx, \n", | |
" 'cutoff_time']).\\\n", | |
" dt.total_seconds() / (3600 * 24)\n", | |
" previous_churn_date = churn_date\n", | |
" # No churns, but need to determine if an active member\n", | |
" else:\n", | |
" # Find transactions before the end of the window that were not cancelled ### here the character was deleted\n", | |
" logs_before = logs.loc[(logs['time'] < window_end)].copy()\n", | |
" # If any log before the window end has a time on or after the window start, the customer has not churned\n", | |
" if np.any(logs_before['time'] >= window_start):\n", | |
" label = 0\n", | |
"\n", | |
" # Assign values\n", | |
" label_times.loc[i, 'label'] = label\n", | |
" label_times.loc[i, 'churn_date'] = churn_date\n", | |
" \n", | |
" # Handle case with no churns\n", | |
" if not np.any(label_times['label'] == 1):\n", | |
" label_times['days_to_churn'] = np.nan\n", | |
" label_times['churn_date'] = pd.NaT\n", | |
" \n", | |
" if return_trans:\n", | |
" return label_times.drop(columns = ['actor_account_id']), logs\n", | |
" \n", | |
" return label_times[['actor_account_id', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']].copy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>actor_account_id</th>\n", | |
" <th>cutoff_time</th>\n", | |
" <th>label</th>\n", | |
" <th>days_to_churn</th>\n", | |
" <th>churn_date</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>003B1E06</td>\n", | |
" <td>2016-05-11</td>\n", | |
" <td>0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" actor_account_id cutoff_time label days_to_churn churn_date\n", | |
"0 003B1E06 2016-05-11 0 NaN NaN" | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#customer_id = train.iloc[0, 0]\n", | |
"customer_logs = train.loc[train['actor_account_id'] == customer_id].copy()\n", | |
"\n", | |
"\n", | |
"label_times = label_customer(customer_id, customer_logs, 'MS', 35, False)\n", | |
"label_times" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
}, | |
"varInspector": { | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"delete_cmd_postfix": "", | |
"delete_cmd_prefix": "del ", | |
"library": "var_list.py", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"delete_cmd_postfix": ") ", | |
"delete_cmd_prefix": "rm(", | |
"library": "var_list.r", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
], | |
"window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment