Kaggle snippets

pandas

time split

import datetime

test_df = df2[df2['arrival_date'] > datetime.datetime(2017, 6, 30)].reset_index()
train_df = df2[(df2['arrival_date'] > datetime.datetime(2016, 5, 30)) & (df2['arrival_date'] < datetime.datetime(2016, 8, 30))].reset_index()

to_datetime

import pandas as pd

df['A'] = pd.to_datetime(df['A'])
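If the string format is known, passing format explicitly speeds up parsing (a minor hedged example; the format string is an assumption):

df['A'] = pd.to_datetime(df['A'], format='%Y-%m-%d')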

concatenation

# Appends along an existing dim.
np.concatenate([one, zero], axis=2)


# Creates a new dim and stacks along it.
np.stack([one, zero], axis=0)
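A quick shape check of the difference (a minimal sketch; one and zero are hypothetical arrays):

import numpy as np

one = np.ones((2, 3, 4))
zero = np.zeros((2, 3, 4))

np.concatenate([one, zero], axis=2).shape  # (2, 3, 8) - the existing axis grows
np.stack([one, zero], axis=0).shape        # (2, 2, 3, 4) - a new leading axis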

target encoding

agg_func = {'is_canceled':['mean','count']}
category_cols=['market_segment','customer_type','distribution_channel','country']
target_col ='is_canceled'

for category_col in category_cols:
    agg_df = train_df.groupby(category_col)[target_col].agg(agg_func)
    agg_df.columns = [category_col+'_'+'_'.join(col).strip() for col in agg_df.columns.values]
    for col in agg_df.columns.values:
        train_df[col] = train_df[category_col].map(agg_df[col]).copy()
        test_df[col] = test_df[category_col].map(agg_df[col]).copy()

If agg_func is a list (e.g. ['mean', 'count']) instead of a dict, agg_df.columns are plain strings rather than tuples, so the rename becomes [category_col + '_' + col for col in agg_df.columns.values], as in the sketch below.
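A minimal sketch of that list variant (assuming the same train_df, test_df, and column lists as above):

agg_func = ['mean', 'count']

for category_col in category_cols:
    agg_df = train_df.groupby(category_col)[target_col].agg(agg_func)
    agg_df.columns = [category_col + '_' + col for col in agg_df.columns.values]
    for col in agg_df.columns.values:
        train_df[col] = train_df[category_col].map(agg_df[col])
        test_df[col] = test_df[category_col].map(agg_df[col])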

categorical encoding

lgb

params

#reg
lgbm_params = {
    'num_iterations': 1000,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': -1,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 0.3,
    'gamma': 0,
    'lambda': 1,
    'eta': 0.1,
    'min_child_weight': 1,
    'verbose': 1,
    'random_state': 0,
    'early_stopping_round': 50
}

# multi class
params = {
            "boosting_type": "gbdt",
            "objective": "multiclass",
            "metric": "multi_logloss",
            "learning_rate": 0.05,
            "colsample_bytree": 0.5,
            "reg_alpha": 0.0,
            "reg_lambda": 0.0,
            "min_split_gain": 10.0,
            "min_child_weight": 2000.0,
            "max_depth": 7,
            "num_class": 3,
            "num_leaves": 50,
        }

#binary
lgbm_params= {
    'num_iterations': 1000,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 7,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 0.3,
    'gamma': 0,
    'lambda': 1,
    'eta': 0.05,
    'min_child_weight': 1,
    'verbosity': 50,
    'random_state': 0,
}

Kfolds predict Zoo

numpy ver

### TODO: add oof
kfolds = 4 # 4 = 75% train, 25% validation
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)
preds = np.zeros((len(test_X_arr),))
FIs = np.zeros(train_X_arr.shape[1])
for train_idx, val_idx in split.split(train_X_arr, train_y):
    lgb_train = lgb.Dataset(train_X_arr[train_idx, :], train_y[train_idx])
    lgb_eval = lgb.Dataset(train_X_arr[val_idx, :], train_y[val_idx], reference=lgb_train)
    model = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_train, lgb_eval], verbose_eval=250)
    preds += model.predict(test_X_arr, num_iteration=model.best_iteration) / split.n_splits
    FIs += model.feature_importance('gain')

pandas ver

kfolds = 4  # 4 = 75% train, 25% validation
split = GroupKFold(n_splits=kfolds)
group = train['group']

preds = np.zeros((len(test),))
oof = np.zeros((len(train),))

FIs = np.zeros(len(col))  # one importance entry per feature in col
for train_idx, val_idx in split.split(train[col], y, group):
    lgb_train = lgb.Dataset(train.loc[train_idx, col], y[train_idx])
    lgb_eval = lgb.Dataset(train.loc[val_idx, col], y[val_idx], reference=lgb_train)
    model = lgb.train(params, lgb_train, num_boost_round=num_iterations,
                      valid_sets=[lgb_eval],
                      verbose_eval=100, early_stopping_rounds=200, feval=MacroF1Metric)
    oof[val_idx] = model.predict(train.loc[val_idx, col], num_iteration=model.best_iteration)
    preds += model.predict(test[col], num_iteration=model.best_iteration) / split.n_splits
    FIs += model.feature_importance('gain')

function ver

def train_lgb(train_X, train_y, split, test_X, num_iterations=5000):
    preds = np.zeros(len(test_X))
    oof = np.zeros(len(train_X))
    FIs = np.zeros(train_X.shape[1])

    for train_idx, val_idx in split.split(train_X, train_y):
        lgb_train = lgb.Dataset(train_X.loc[train_idx, :], train_y[train_idx])
        lgb_eval = lgb.Dataset(train_X.loc[val_idx, :], train_y[val_idx], reference=lgb_train)
        model = lgb.train(params, lgb_train, num_iterations, valid_sets=[lgb_eval],
                          verbose_eval=100, early_stopping_rounds=200)

        oof[val_idx] = model.predict(train_X.loc[val_idx, :], num_iteration=model.best_iteration)
        preds += model.predict(test_X, num_iteration=model.best_iteration) / split.n_splits
        FIs += model.feature_importance('gain')

    return oof, preds, FIs

split = KFold(n_splits=5,random_state=42,shuffle=True)
oof, preds, FIs = train_lgb(train_X, train_y, split, test_X, num_iterations=5000)

FIs plot

def get_FI_plot(FIs, columns, max_row=500, figsize=(8, 20), save_path=None):
    fig, ax = plt.subplots(figsize=figsize)
    df = pd.DataFrame({'FI': FIs, 'col': columns})
    df = df.sort_values('FI', ascending=False).reset_index(drop=True).iloc[:max_row, :]
    sns.barplot(x='FI', y='col', data=df, ax=ax)
    if save_path:
        plt.savefig(save_path)
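Usage (a sketch assuming FIs and train_X from the KFold snippets above; the save path is hypothetical):

get_FI_plot(FIs, train_X.columns, max_row=50, save_path='fi.png')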

expand multiple str choice columns

Function to expand a column that holds multiple values separated by a delimiter into one-hot columns.

def expand_multiple_choice_columns(df, multiple_cols, symbol=';'):
    for c in multiple_cols:
        # Check if there are multiple entries in this column
        temp = df[c].str.split(symbol, expand=True)

        # Get all the possible values in this column
        new_columns = pd.unique(temp.values.ravel())
        for new_c in new_columns:
            if new_c and new_c is not np.nan:
                # Create a new column for each unique value
                idx = df[c].str.contains(new_c, regex=False).fillna(False)
                df.loc[idx, f"{c}_{new_c}"] = 1
        print(f">> Multiple entries in {c}. Added {len(new_columns)} one-hot-encoding columns")
        # Drop the original column
        df.drop(c, axis=1, inplace=True)
    return df
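A toy usage sketch (the langs column and its values are hypothetical):

toy = pd.DataFrame({'langs': ['python;sql', 'sql', np.nan]})
toy = expand_multiple_choice_columns(toy, ['langs'])
# -> adds one-hot columns langs_python and langs_sql, then drops langs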

encode all string labels to categorical

def str_cols2num(df):
    str_cols = df.select_dtypes(include='object').columns
    str_dict = defaultdict(LabelEncoder)

    df[str_cols] = df[str_cols].fillna('Nan')
    df[str_cols] = df[str_cols].apply(lambda x: str_dict[x.name].fit_transform(x))
    # df[str_cols] = df[str_cols].apply(lambda x: d[x.name].inverse_transform(x))
    df[str_cols] = df[str_cols].astype('category')
    return df,str_dict,str_cols

data, str_dict, str_cols = str_cols2num(data)

convert ordinal variable to numeric

# example dictionary
company_size_dict = {
    'Fewer than 10 employees': 1,
    '10 to 19 employees': 2,
}

def convert_ordinal_variable_to_numeric(df, cols, correspondence_dict):
    for col in cols:
        for key, value in correspondence_dict.items():
            df.loc[df[col]==key, col] = value
        df[col] = df[col].fillna(-1)
        print(df[col].unique())
    return df

data = convert_ordinal_variable_to_numeric(data,['CompanySize'],company_size_dict)

fillna

fillna (Stack Overflow)

Extract only the rows whose value in one column contains any of several given categories:

df.loc[df[subset_col].str.contains('|'.join(subset_values)).fillna(False)]

fillna with median

num_cols = df.select_dtypes(exclude='object').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

label encoding

Raises an error if test contains categories that are unseen in train.

le_dict = {}
for col in cat_features:
    le = LabelEncoder()  # a fresh encoder per column, stored under its column name
    train_X[col] = le.fit_transform(train_X[col])
    test_X[col] = le.transform(test_X[col])
    le_dict[col] = le

train_X[cat_features] = train_X[cat_features].astype('category')
test_X[cat_features] = test_X[cat_features].astype('category')
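A hedged workaround for unseen test categories (a minimal sketch, assuming the same train_X / test_X): fit each encoder on the union of train and test values. Fitting on test values is common in competitions but leaks information in real deployments.

for col in cat_features:
    le = LabelEncoder()
    le.fit(pd.concat([train_X[col], test_X[col]], axis=0))
    train_X[col] = le.transform(train_X[col])
    test_X[col] = le.transform(test_X[col])
    le_dict[col] = le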

label_encoding2

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

str_cols = df.select_dtypes(include='object').columns
str_dict = defaultdict(LabelEncoder)

df[str_cols] = df[str_cols].fillna('Nan')
df[str_cols] = df[str_cols].apply(lambda x: str_dict[x.name].fit_transform(x))
# df[str_cols] = df[str_cols].apply(lambda x: d[x.name].inverse_transform(x))

plot FI

df = pd.DataFrame({'FI': FIs, 'col': train_X.columns})
df = df.sort_values('FI', ascending=False).reset_index(drop=True)
sns.barplot(x='FI', y='col', data=df)
plt.show()

Add batch_group from the beginning to the end

batch_size = 4000
# any column works here in place of 'pred'
oof_df.groupby(oof_df.index//batch_size, sort=False)['pred'].agg(['ngroup']).values
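A sketch of what this produces, using the equivalent GroupBy.ngroup call directly (toy values):

toy = pd.DataFrame({'pred': [0.1, 0.2, 0.3, 0.4, 0.5]})
toy.groupby(toy.index // 2, sort=False).ngroup().values
# -> array([0, 0, 1, 1, 2]): a 0-based batch id for every row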

Get category nth

List the top-n most frequent elements within each category.
src: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#aggregation
There is probably a more polished way to write this.

# Add each character's appearance count to every row
whole_df['character_ncount'] = whole_df.groupby(['name','character'])['id'].transform('count')
# Sort by manga title (name) and appearance count
whole_df.sort_values(['name','character_ncount'], inplace=True, ascending=False)
# Drop rows for the same character, group by title, and keep only the top five
whole_df.drop_duplicates(subset=['character','name']).groupby('name').head(5)[['name','character','character_ncount']]

select specific type of columns

df.select_dtypes(include='object')

check overall trends

def plot_each_features(df, nrow=4, ncol=5, figsize=(20,8), res=100):
    '''
    Plot the index vs. value for each column
    '''
    fig, axes = plt.subplots(nrow, ncol, figsize=figsize)
    axes = axes.flatten()
    for col, ax in zip(df.columns, axes):
        ax.plot(df[col][0::res])
        ax.set_title(col)
    plt.tight_layout()
    plt.show()

timeseries

timeseries vs a variable

Keep the category column and the numeric column in long format.

resort_df.groupby('arrival_date')['hotel'].count().plot(figsize=(15,5), color=next(color_cycle))
city_df.groupby('arrival_date')['hotel'].count().plot(figsize=(15,5), color=next(color_cycle))

plt.tight_layout()
plt.title('Daily booking access')
plt.legend(['resort','hotel'])
plt.show()

plt.figure(figsize=(15,10))
plt.plot(train['time'], train['signal'], color=next(color_cycle))
plt.show()

Thinned out timeseries plot

plt.figure(figsize=(20,5)); res = 1000
plt.plot(range(0,train.shape[0],res),train.open_channels[0::res])
for i in range(11): plt.plot([i*500000,i*500000],[-5,12.5],'r')
for j in range(10): plt.text(j*500000+200000,10,str(j+1),size=20)
plt.xlabel('Row',size=16); plt.ylabel('Signal',size=16); 
plt.title('Training Data Open Channel - 10 batches',size=20)
plt.show()

plotly

from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=2, cols=1)

x_1 = train.loc[:100]["time"]
y_1 = train.loc[:100]["signal"]
x_2 = train.loc[100:200]["time"]
y_2 = train.loc[100:200]["signal"]

fig.add_trace(go.Scatter(x=x_1, y=y_1, showlegend=False,
                    mode='lines+markers', name="First sample",marker=dict(color=next(color_cycle))),row=1, col=1)

fig.add_trace(go.Scatter(x=x_2, y=y_2, showlegend=False,
                    mode='lines+markers', name="Second sample",
                         marker=dict(color=next(color_cycle))),
             row=2, col=1)

fig.update_layout(height=1200, width=800, title_text="Sample signals")
fig.show()

categorical

multi category vs boxplot with mean

plotly

df = pd.DataFrame(np.transpose([np.mean(np.abs(signals), axis=1), targets]))
df.columns = ["signal_mean", "open_channels"]
fig = go.Figure()

channels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

for channel in channels:
    fig.add_trace(go.Box(x=df['open_channels'][df['open_channels'] == channel],
                         y=df['signal_mean'][df['open_channels'] == channel],
                         name=channel,
                         marker=dict(color='seagreen'), showlegend=False)
                         )
    
fig.add_trace(go.Scatter(x=channels,
                         y=[df['signal_mean'][df['open_channels'] == channel].median() for channel in channels],
                         mode="lines+markers",
                         name="median",
                         marker=dict(color='seagreen'), showlegend=False)
                         )

fig.update_layout(title="Signal mean vs. Open channels", xaxis_title="Open channels", yaxis_title="Signal mean")
fig.show()

verify that the DataFrame does not contain any odd values

def valid_dataframe(df):
    nulls = df.isnull().sum().sum()
    assert nulls == 0, f'df includes null values at {df.isnull().any(axis=1).values}'
    assert len(np.unique(df.columns)) == len(df.columns), 'df includes duplicate column names'
    return True

macro f1 score

from sklearn.metrics import f1_score

def MacroF1Metric(preds, dtrain):
    labels = dtrain.get_label()
    preds = np.round(np.clip(preds, 0, 10)).astype(int)
    score = f1_score(labels, preds, average='macro')
    return ('MacroF1Metric', score, True)
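Pass it to lgb.train via feval (a minimal usage sketch, assuming params, lgb_train, and lgb_eval as in the KFold snippets above):

model = lgb.train(params, lgb_train, valid_sets=[lgb_eval], feval=MacroF1Metric)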
seed everything

import os
import random

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(SEED)
EDA utils

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 9 15:28:58 2018
@author: kazuki.onodera
"""
import numpy as np
import pandas as pd
import itertools
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib_venn import venn2


def df_info(target_df, topN=10):
    max_row = target_df.shape[0]
    print(f'Shape: {target_df.shape}')
    df = target_df.dtypes.to_frame()
    df.columns = ['DataType']
    df['#Nulls'] = target_df.isnull().sum()
    df['#Uniques'] = target_df.nunique()

    # stats
    df['Min'] = target_df.min(numeric_only=True)
    df['Mean'] = target_df.mean(numeric_only=True)
    df['Max'] = target_df.max(numeric_only=True)
    df['Std'] = target_df.std(numeric_only=True)

    # top 10 values
    df[f'top{topN} val'] = 0
    df[f'top{topN} cnt'] = 0
    df[f'top{topN} ratio'] = 0
    for c in df.index:
        vc = target_df[c].value_counts().head(topN)
        val = list(vc.index)
        cnt = list(vc.values)
        ratio = list((vc.values / max_row).round(2))
        df.loc[c, f'top{topN} val'] = str(val)
        df.loc[c, f'top{topN} cnt'] = str(cnt)
        df.loc[c, f'top{topN} ratio'] = str(ratio)
    return df
def top_categories(df, category_feature, topN=30):
    return df[category_feature].value_counts().head(topN).index
def count_categories(df, category_features, topN=30, sort='freq', df2=None):
    for c in category_features:
        target_value = df[c].value_counts().head(topN).index
        if sort == 'freq':
            order = target_value
        elif sort == 'alphabetic':
            order = df[c].value_counts().head(topN).sort_index().index
        if df2 is not None:
            plt.subplot(1, 2, 1)
        sns.countplot(x=c, data=df[df[c].isin(order)], order=order)
        plt.xticks(rotation=90)
        if df2 is not None:
            plt.subplot(1, 2, 2)
            sns.countplot(x=c, data=df2[df2[c].isin(order)], order=order)
            plt.xticks(rotation=90)
        if df2 is not None:
            plt.suptitle(f'{c} TOP{topN}', size=25)
        else:
            plt.title(f'{c} TOP{topN}', size=25)
        plt.tight_layout()
        plt.show()
    return
def hist_continuous(df, continuous_features, bins=30, df2=None):
    for c in continuous_features:
        if df2 is not None:
            plt.subplot(1, 2, 1)
        df[c].hist(bins=bins)
        if df2 is not None:
            plt.subplot(1, 2, 2)
            df2[c].hist(bins=bins)
        if df2 is not None:
            plt.suptitle(f'{c}', size=25)
        else:
            plt.title(f'{c}', size=25)
        plt.tight_layout()
        plt.show()
    return
def venn_diagram(train, test, category_features, names=('train', 'test'), figsize=(18, 13)):
    """
    category_features: max==6
    """
    n = int(np.ceil(len(category_features) / 2))
    plt.figure(figsize=figsize)
    for i, c in enumerate(category_features):
        plt.subplot(int(f'{n}2{i+1}'))
        venn2([set(train[c].unique()), set(test[c].unique())],
              set_labels=names)
        plt.title(f'{c}', fontsize=18)
    plt.show()
    return
def split_seq(iterable, size):
    """
    In:  list(split_seq(range(9), 4))
    Out: [[0, 1, 2, 3], [4, 5, 6, 7], [8]]
    """
    it = iter(iterable)
    item = list(itertools.islice(it, size))
    while item:
        yield item
        item = list(itertools.islice(it, size))

general

  1. Check aggregate statistics
  2. Check for outliers

regression

  1. Violin plot with X: categorical variable, Y: target variable
  2. Scatter plot with X: continuous variable, Y: target variable (see the sketch below)
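A minimal seaborn sketch of those two plots (assuming a DataFrame df with a categorical column cat, a continuous column num, and a target column y; all names are hypothetical):

import seaborn as sns
from matplotlib import pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.violinplot(x='cat', y='y', data=df, ax=axes[0])   # categorical vs. target
sns.scatterplot(x='num', y='y', data=df, ax=axes[1])  # continuous vs. target
plt.tight_layout()
plt.show()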

binary classification (2 classes)

multiclass classification
