Abubakar Alaro Abuton

## Assignment.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                Abuton
                / Assignment.ipynb
            
            
              Created
              January 5, 2021 11:04
            
              
                Final Assignment on Scalable ML using pyspark
              
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## convert_arr_series_2_dataframe.py
import pandas as pd

# Expand every cell of 'arrival_delay_deciles' into its own row of columns
# (presumably each cell holds a list-like of decile values - confirm upstream).
percentiles = df['arrival_delay_deciles'].apply(pd.Series)
# Turn the column labels 0, 1, 2, ... into percentage labels '0%', '10%', '20%', ...
percentiles = percentiles.rename(columns=lambda position: f'{position * 10}%')
percentiles.head()

## get_agg_by_cols.py
# Feature engineering: aggregate numerical columns, grouped by categorical (object) columns
def get_agg_by_cols(data, col, agg_list, agg_cols):
    """
    data : dataframe to calculate FE on
    col : list of columns to aggregate by (the object datatype)
    agg_list : a list of statistical measure e.g mean, median
    agg_cols : a list of numerical datatype columns
    """
    for cols in col:
        for i in agg_list:

## stratifiedkfold.py
from sklearn.model_selection import KFold, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
# Set up 5-fold stratified cross-validation (shuffled, fixed seed) plus two
# empty lists. Judging by their names they collect per-fold scores and
# predictions; the code that fills them is not visible in this excerpt.
kfold, scores, y_pred_totcb = StratifiedKFold(n_splits=5, shuffle=True, random_state=221), list(), []
for train, test in kfold.split(X, y):
    # `train` / `test` are used as positional row selectors for this fold;
    # X and y are defined elsewhere (presumably pandas objects, given `.iloc`).
    x_train, x_test = X.iloc[train], X.iloc[test]
    y_train, y_test = y.iloc[train], y.iloc[test]

    # A new classifier is built for every fold, with AUC as the evaluation metric.
    # NOTE(review): `cat_columns` comes from outside this excerpt, and the
    # fit/predict steps that presumably follow are not shown here.
    model = CatBoostClassifier(random_state=27,  n_estimators=3000, cat_features = cat_columns,
                               max_depth=7, verbose=500, learning_rate=0.102, eval_metric='AUC')

## remove_system_generated_msgs.py
def remove_system_generated_msgs(uploaded_file: str) -> list:
  """Remove system generated messages like::
      1. +234 was added
      2. +234 left etc

      uploaded_file: str: path of dataset

	   Returns:
     --------
     list

## remove_emoji.py
 #removing emojis
def remove_emoji(text: str) -> str:
    """
    Remove emojis from chats, since I think I don't need them.

    args:
    ----
    text: a single list of message

    Returns:

## process_text.py
import pandas as pd

def processChatMsgs(wh_chat: list) -> pd.DataFrame:
  """
	process the uploaded chat data by removing unwanted
	entries and returns a dataframe

	args:
	-----
	wh_chat: whatsapp chat data

## get_total_msg.py
def get_total_msg(user:str, df:pd.DataFrame) -> str:
  """
  Return the total msgs 'user' has sent
  Args:
  ----
  user: the user whose total message count is returned
  df: pd.DataFrame: dataframe that stores information about each user and message sent

  Returns:
  --------

## draw_wordcloud.py
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def draw_wordcloud(msgs: list):
	"""
	Draw wordcloud for visualization of the most used words
	during conversation
	args:
	-----

## pie_chart.py
def pie_chart(user):
  """
  Plot `user` as a pie chart with a small gap between the slices.

  Args:
  ----
  user: object supporting `len()` and `.plot(kind='pie', ...)`
        (presumably a pandas Series of per-user values - confirm with caller)

  NOTE(review): this excerpt appears truncated. `inner_circle` is created
  below but never attached to the figure in the visible code, and nothing
  is returned or shown here.
  """
  fig, ax = plt.subplots(figsize=(15, 8))
  # One explode offset of 0.005 per entry in `user`, so every slice is
  # pulled out by the same small amount.
  explodex = []
  for i in np.arange(len(user)):
      explodex.append(0.005)
  # Only five colours are listed; with more than five entries matplotlib
  # presumably cycles through them - TODO confirm.
  ax = user.plot(kind='pie', colors=['red', 'green', 'cyan', 'lime', 'gold'],
                 fontsize=12, autopct='%1.1f%%', startangle=180,
                pctdistance=0.85, explode=explodex)
  # White circle of radius 0.50 centred on the origin - presumably meant to
  # be drawn over the pie centre to give a donut look.
  inner_circle = plt.Circle((0,0), 0.50, fc='white')
  # Rebind `fig` to the current figure.
  fig = plt.gcf()
	import pandas as pd

	percentiles = df['arrival_delay_deciles'].apply(pd.Series)
	percentiles.rename(columns = lambda x : '{0}%'.format(x*10), inplace=True)
	percentiles.head()
	# function to calculate feature engineering by aggregating integer columns
	def get_agg_by_cols(data, col, agg_list, agg_cols):
	"""
	data : dataframe to calculate FE on
	col : list of columns to aggregate by (the object datatype)
	agg_list : a list of statistical measure e.g mean, median
	agg_cols : a list of numerical datatype columns
	"""
	for cols in col:
	for i in agg_list:
	from sklearn.model_selection import KFold, StratifiedKFold
	from catboost import CatBoostClassifier
	from sklearn.metrics import roc_auc_score
	kfold, scores, y_pred_totcb = StratifiedKFold(n_splits=5, shuffle=True, random_state=221), list(), []
	for train, test in kfold.split(X, y):
	x_train, x_test = X.iloc[train], X.iloc[test]
	y_train, y_test = y.iloc[train], y.iloc[test]

	model = CatBoostClassifier(random_state=27, n_estimators=3000, cat_features = cat_columns,
	max_depth=7, verbose=500, learning_rate=0.102, eval_metric='AUC')
	def remove_system_generated_msgs(uploaded_file: str) -> list:
	"""Remove system generated messages like::
	1. +234 was added
	2. +234 left etc

	uploaded_file: str: path of dataset

	Returns:
	--------
	list
	#removing emojis
	def remove_emoji(text: str) -> str:
	"""
	remove emojis from chats i think idont need

	args:
	----
	text: a single list of message

	Returns:
	import pandas as pd

	def processChatMsgs(wh_chat: list) -> pd.DataFrame:
	"""
	process the uploaded chat data by removing unwanted
	entries and returns a dataframe

	args:
	-----
	wh_chat: whatsapp chat data
	def get_total_msg(user:str, df:pd.DataFrame) -> str:
	"""
	Return the total msgs 'user' has sent
	Args:
	----
	user: the user to extract his/her total count of messages
	df: pd.DataFrame: dataframe that stores information about each user and message sent

	Returns:
	--------
	from nltk.corpus import stopwords
	import matplotlib.pyplot as plt
	from wordcloud import WordCloud

	def draw_wordcloud(msgs: list):
	"""
	Draw wordcloud for visualization of the most used words
	during conversation
	args:
	-----
	def pie_chart(user):
	fig, ax = plt.subplots(figsize=(15, 8))
	explodex = []
	for i in np.arange(len(user)):
	explodex.append(0.005)
	ax = user.plot(kind='pie', colors=['red', 'green', 'cyan', 'lime', 'gold'],
	fontsize=12, autopct='%1.1f%%', startangle=180,
	pctdistance=0.85, explode=explodex)
	inner_circle = plt.Circle((0,0), 0.50, fc='white')
	fig = plt.gcf()