Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Expand every cell of 'arrival_delay_deciles' into its own row of columns.
# NOTE(review): presumably each cell holds a sequence of decile values, so the
# resulting columns are the integer positions 0..N -- confirm against `df`.
percentiles = df['arrival_delay_deciles'].apply(pd.Series)

# Relabel column k as the (k * 10)th percentile, e.g. 0 -> '0%', 5 -> '50%'.
percentiles = percentiles.rename(columns=lambda k: '{0}%'.format(k * 10))

# Preview the first rows (displays only as the last expression of a notebook cell).
percentiles.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function to calculate feature engineering by aggregating integer columns | |
def get_agg_by_cols(data, col, agg_list, agg_cols): | |
""" | |
data : dataframe to calculate FE on | |
col : list of columns to aggregate by (the object datatype) | |
agg_list : a list of statistical measure e.g mean, median | |
agg_cols : a list of numerical datatype columns | |
""" | |
for cols in col: | |
for i in agg_list: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import KFold, StratifiedKFold | |
from catboost import CatBoostClassifier | |
from sklearn.metrics import roc_auc_score | |
kfold, scores, y_pred_totcb = StratifiedKFold(n_splits=5, shuffle=True, random_state=221), list(), [] | |
for train, test in kfold.split(X, y): | |
x_train, x_test = X.iloc[train], X.iloc[test] | |
y_train, y_test = y.iloc[train], y.iloc[test] | |
model = CatBoostClassifier(random_state=27, n_estimators=3000, cat_features = cat_columns, | |
max_depth=7, verbose=500, learning_rate=0.102, eval_metric='AUC') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_system_generated_msgs(uploaded_file: str) -> list: | |
"""Remove system generated messages like:: | |
1. +234 was added | |
2. +234 left etc | |
uploaded_file: str: path of dataset | |
Returns: | |
-------- | |
list |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#removing emojis | |
def remove_emoji(text: str) -> str: | |
""" | |
remove emojis from chats i think idont need | |
args: | |
---- | |
text: a single list of message | |
Returns: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def processChatMsgs(wh_chat: list) -> pd.DataFrame: | |
""" | |
process the uploaded chat data by removing unwanted | |
entries and returns a dataframe | |
args: | |
----- | |
wh_chat: whatsapp chat data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_total_msg(user:str, df:pd.DataFrame) -> str: | |
""" | |
Return the total msgs 'user' has sent | |
Args: | |
---- | |
user: the user to extract his/her total count of messages | |
df: pd.DataFrame: dataframe that stores information about each user and message sent | |
Returns: | |
-------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import stopwords | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud | |
def draw_wordcloud(msgs: list): | |
""" | |
Draw wordcloud for visualization of the most used words | |
during conversation | |
args: | |
----- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def pie_chart(user): | |
fig, ax = plt.subplots(figsize=(15, 8)) | |
explodex = [] | |
for i in np.arange(len(user)): | |
explodex.append(0.005) | |
ax = user.plot(kind='pie', colors=['red', 'green', 'cyan', 'lime', 'gold'], | |
fontsize=12, autopct='%1.1f%%', startangle=180, | |
pctdistance=0.85, explode=explodex) | |
inner_circle = plt.Circle((0,0), 0.50, fc='white') | |
fig = plt.gcf() |
OlderNewer