Kaggle snippets

pandas

time split

import datetime

test_df = df2[df2['arrival_date'] > datetime.datetime(2017, 6, 30)].reset_index()
train_df = df2[(df2['arrival_date'] > datetime.datetime(2016, 5, 30)) & (df2['arrival_date'] < datetime.datetime(2016, 8, 30))].reset_index()

to_datetime

import pandas as pd

df['A'] = pd.to_datetime(df['A'])
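If the string format is known, passing format explicitly speeds up parsing (a minor hedged example; the format string is an assumption):

df['A'] = pd.to_datetime(df['A'], format='%Y-%m-%d')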

concatenation

# Appends along an existing dim.
np.concatenate([one, zero], axis=2)


# Creates a new dim and stacks along it.
np.stack([one, zero], axis=0)
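A quick shape check of the difference (a minimal sketch; one and zero are hypothetical arrays):

import numpy as np

one = np.ones((2, 3, 4))
zero = np.zeros((2, 3, 4))

np.concatenate([one, zero], axis=2).shape  # (2, 3, 8) - the existing axis grows
np.stack([one, zero], axis=0).shape        # (2, 2, 3, 4) - a new leading axis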

target encoding

agg_func = {'is_canceled':['mean','count']}
category_cols=['market_segment','customer_type','distribution_channel','country']
target_col ='is_canceled'

for category_col in category_cols:
    agg_df = train_df.groupby(category_col)[target_col].agg(agg_func)
    agg_df.columns = [category_col+'_'+'_'.join(col).strip() for col in agg_df.columns.values]
    for col in agg_df.columns.values:
        train_df[col] = train_df[category_col].map(agg_df[col]).copy()
        test_df[col] = test_df[category_col].map(agg_df[col]).copy()

If agg_func is a list (e.g. ['mean', 'count']) instead of a dict, agg_df.columns are plain strings rather than tuples, so the rename becomes [category_col + '_' + col for col in agg_df.columns.values], as in the sketch below.
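A minimal sketch of that list variant (assuming the same train_df, test_df, and column lists as above):

agg_func = ['mean', 'count']

for category_col in category_cols:
    agg_df = train_df.groupby(category_col)[target_col].agg(agg_func)
    agg_df.columns = [category_col + '_' + col for col in agg_df.columns.values]
    for col in agg_df.columns.values:
        train_df[col] = train_df[category_col].map(agg_df[col])
        test_df[col] = test_df[category_col].map(agg_df[col])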

categorical encoding

lgb

params

#reg
lgbm_params = {
    'num_iterations': 1000,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': -1,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 0.3,
    'gamma': 0,
    'lambda': 1,
    'eta': 0.1,
    'min_child_weight': 1,
    'verbose': 1,
    'random_state': 0,
    'early_stopping_round': 50
}

# multi class
params = {
            "boosting_type": "gbdt",
            "objective": "multiclass",
            "metric": "multi_logloss",
            "learning_rate": 0.05,
            "colsample_bytree": 0.5,
            "reg_alpha": 0.0,
            "reg_lambda": 0.0,
            "min_split_gain": 10.0,
            "min_child_weight": 2000.0,
            "max_depth": 7,
            "num_class": 3,
            "num_leaves": 50,
        }

#binary
lgbm_params= {
    'num_iterations': 1000,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 7,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 0.3,
    'gamma': 0,
    'lambda': 1,
    'eta': 0.05,
    'min_child_weight': 1,
    'verbosity': 50,
    'random_state': 0,
}

Kfolds predict Zoo

numpy ver

### TODO: add oof
kfolds = 4 # 4 = 75% train, 25% validation
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)
preds = np.zeros((len(test_X_arr),))
FIs = np.zeros(train_X_arr.shape[1])
for train_idx, val_idx in split.split(train_X_arr, train_y):
    lgb_train = lgb.Dataset(train_X_arr[train_idx, :], train_y[train_idx])
    lgb_eval = lgb.Dataset(train_X_arr[val_idx, :], train_y[val_idx], reference=lgb_train)
    model = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_train, lgb_eval], verbose_eval=250)
    preds += model.predict(test_X_arr, num_iteration=model.best_iteration) / split.n_splits
    FIs += model.feature_importance('gain')

pandas ver

kfolds = 4  # 4 = 75% train, 25% validation
split = GroupKFold(n_splits=kfolds)
group = train['group']

preds = np.zeros((len(test),))
oof = np.zeros((len(train),))

FIs = np.zeros(len(col))  # one importance entry per feature in col
for train_idx, val_idx in split.split(train[col], y, group):
    lgb_train = lgb.Dataset(train.loc[train_idx, col], y[train_idx])
    lgb_eval = lgb.Dataset(train.loc[val_idx, col], y[val_idx], reference=lgb_train)
    model = lgb.train(params, lgb_train, num_boost_round=num_iterations,
                      valid_sets=[lgb_eval],
                      verbose_eval=100, early_stopping_rounds=200, feval=MacroF1Metric)
    oof[val_idx] = model.predict(train.loc[val_idx, col], num_iteration=model.best_iteration)
    preds += model.predict(test[col], num_iteration=model.best_iteration) / split.n_splits
    FIs += model.feature_importance('gain')

function ver

def train_lgb(train_X, train_y, split, test_X, num_iterations=5000):
    preds = np.zeros(len(test_X))
    oof = np.zeros(len(train_X))
    FIs = np.zeros(train_X.shape[1])

    for train_idx, val_idx in split.split(train_X, train_y):
        lgb_train = lgb.Dataset(train_X.loc[train_idx, :], train_y[train_idx])
        lgb_eval = lgb.Dataset(train_X.loc[val_idx, :], train_y[val_idx], reference=lgb_train)
        model = lgb.train(params, lgb_train, num_iterations, valid_sets=[lgb_eval],
                          verbose_eval=100, early_stopping_rounds=200)

        oof[val_idx] = model.predict(train_X.loc[val_idx, :], num_iteration=model.best_iteration)
        preds += model.predict(test_X, num_iteration=model.best_iteration) / split.n_splits
        FIs += model.feature_importance('gain')

    return oof, preds, FIs

split = KFold(n_splits=5,random_state=42,shuffle=True)
oof, preds, FIs = train_lgb(train_X, train_y, split, test_X, num_iterations=5000)

FIs plot

def get_FI_plot(FIs, columns, max_row=500, figsize=(8, 20), save_path=None):
    fig, ax = plt.subplots(figsize=figsize)
    df = pd.DataFrame({'FI': FIs, 'col': columns})
    df = df.sort_values('FI', ascending=False).reset_index(drop=True).iloc[:max_row, :]
    sns.barplot(x='FI', y='col', data=df, ax=ax)
    if save_path:
        plt.savefig(save_path)
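Usage (a sketch assuming FIs and train_X from the KFold snippets above; the save path is hypothetical):

get_FI_plot(FIs, train_X.columns, max_row=50, save_path='fi.png')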

expand multiple str choice columns

Function to expand a column that holds multiple values separated by a delimiter into one-hot columns.

def expand_multiple_choice_columns(df, multiple_cols, symbol=';'):
    for c in multiple_cols:
        # Check if there are multiple entries in this column
        temp = df[c].str.split(symbol, expand=True)

        # Get all the possible values in this column
        new_columns = pd.unique(temp.values.ravel())
        for new_c in new_columns:
            if new_c and new_c is not np.nan:
                # Create a new column for each unique value
                idx = df[c].str.contains(new_c, regex=False).fillna(False)
                df.loc[idx, f"{c}_{new_c}"] = 1
        print(f">> Multiple entries in {c}. Added {len(new_columns)} one-hot-encoding columns")
        # Drop the original column
        df.drop(c, axis=1, inplace=True)
    return df
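A toy usage sketch (the langs column and its values are hypothetical):

toy = pd.DataFrame({'langs': ['python;sql', 'sql', np.nan]})
toy = expand_multiple_choice_columns(toy, ['langs'])
# -> adds one-hot columns langs_python and langs_sql, then drops langs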

encode all string labels to categorical

def str_cols2num(df):
    str_cols = df.select_dtypes(include='object').columns
    str_dict = defaultdict(LabelEncoder)

    df[str_cols] = df[str_cols].fillna('Nan')
    df[str_cols] = df[str_cols].apply(lambda x: str_dict[x.name].fit_transform(x))
    # df[str_cols] = df[str_cols].apply(lambda x: d[x.name].inverse_transform(x))
    df[str_cols] = df[str_cols].astype('category')
    return df,str_dict,str_cols

data, str_dict, str_cols = str_cols2num(data)

convert ordinal variable to numeric

# example dictionary
company_size_dict = {
    'Fewer than 10 employees': 1,
    '10 to 19 employees': 2,
}

def convert_ordinal_variable_to_numeric(df, cols, correspondence_dict):
    for col in cols:
        for key, value in correspondence_dict.items():
            df.loc[df[col]==key, col] = value
        df[col] = df[col].fillna(-1)
        print(df[col].unique())
    return df

data = convert_ordinal_variable_to_numeric(data,['CompanySize'],company_size_dict)

fillna

fillna (Stack Overflow)

Extract only the rows whose value in one column contains any of several given categories:

df.loc[df[subset_col].str.contains('|'.join(subset_values)).fillna(False)]

fillna with median

num_cols = df.select_dtypes(exclude='object').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

label encoding

Raises an error if test contains categories that are unseen in train.

le_dict = {}
for col in cat_features:
    le = LabelEncoder()  # a fresh encoder per column, stored under its column name
    train_X[col] = le.fit_transform(train_X[col])
    test_X[col] = le.transform(test_X[col])
    le_dict[col] = le

train_X[cat_features] = train_X[cat_features].astype('category')
test_X[cat_features] = test_X[cat_features].astype('category')
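A hedged workaround for unseen test categories (a minimal sketch, assuming the same train_X / test_X): fit each encoder on the union of train and test values. Fitting on test values is common in competitions but leaks information in real deployments.

for col in cat_features:
    le = LabelEncoder()
    le.fit(pd.concat([train_X[col], test_X[col]], axis=0))
    train_X[col] = le.transform(train_X[col])
    test_X[col] = le.transform(test_X[col])
    le_dict[col] = le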

label_encoding2

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

str_cols = df.select_dtypes(include='object').columns
str_dict = defaultdict(LabelEncoder)

df[str_cols] = df[str_cols].fillna('Nan')
df[str_cols] = df[str_cols].apply(lambda x: str_dict[x.name].fit_transform(x))
# df[str_cols] = df[str_cols].apply(lambda x: d[x.name].inverse_transform(x))

plot FI

df = pd.DataFrame({'FI': FIs, 'col': train_X.columns})
df = df.sort_values('FI', ascending=False).reset_index(drop=True)
sns.barplot(x='FI', y='col', data=df)
plt.show()

Add batch_group from the beginning to the end

batch_size = 4000
# any column works here in place of 'pred'
oof_df.groupby(oof_df.index//batch_size, sort=False)['pred'].agg(['ngroup']).values
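A sketch of what this produces, using the equivalent GroupBy.ngroup call directly (toy values):

toy = pd.DataFrame({'pred': [0.1, 0.2, 0.3, 0.4, 0.5]})
toy.groupby(toy.index // 2, sort=False).ngroup().values
# -> array([0, 0, 1, 1, 2]): a 0-based batch id for every row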

Get category nth

List the top-n most frequent elements within each category.
src: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#aggregation
There is probably a more polished way to write this.

# Add each character's appearance count to every row
whole_df['character_ncount'] = whole_df.groupby(['name','character'])['id'].transform('count')
# Sort by manga title (name) and appearance count
whole_df.sort_values(['name','character_ncount'], inplace=True, ascending=False)
# Drop rows for the same character, group by title, and keep only the top five
whole_df.drop_duplicates(subset=['character','name']).groupby('name').head(5)[['name','character','character_ncount']]

select specific type of columns

df.select_dtypes(include='object')

check overall trends

def plot_each_features(df, nrow=4, ncol=5, figsize=(20,8), res=100):
    '''
    Plot the index vs. value for each column
    '''
    fig, axes = plt.subplots(nrow, ncol, figsize=figsize)
    axes = axes.flatten()
    for col, ax in zip(df.columns, axes):
        ax.plot(df[col][0::res])
        ax.set_title(col)
    plt.tight_layout()
    plt.show()

timeseries

timeseries vs a variable

Keep the category column and the numeric column in long format.

resort_df.groupby('arrival_date')['hotel'].count().plot(figsize=(15,5), color=next(color_cycle))
city_df.groupby('arrival_date')['hotel'].count().plot(figsize=(15,5), color=next(color_cycle))

plt.tight_layout()
plt.title('Daily booking access')
plt.legend(['resort','hotel'])
plt.show()

plt.figure(figsize=(15,10))
plt.plot(train['time'], train['signal'], color=next(color_cycle))
plt.show()

Thinned out timeseries plot

plt.figure(figsize=(20,5)); res = 1000
plt.plot(range(0,train.shape[0],res),train.open_channels[0::res])
for i in range(11): plt.plot([i*500000,i*500000],[-5,12.5],'r')
for j in range(10): plt.text(j*500000+200000,10,str(j+1),size=20)
plt.xlabel('Row',size=16); plt.ylabel('Signal',size=16); 
plt.title('Training Data Open Channel - 10 batches',size=20)
plt.show()

plotly

from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=2, cols=1)

x_1 = train.loc[:100]["time"]
y_1 = train.loc[:100]["signal"]
x_2 = train.loc[100:200]["time"]
y_2 = train.loc[100:200]["signal"]

fig.add_trace(go.Scatter(x=x_1, y=y_1, showlegend=False,
                    mode='lines+markers', name="First sample",marker=dict(color=next(color_cycle))),row=1, col=1)

fig.add_trace(go.Scatter(x=x_2, y=y_2, showlegend=False,
                    mode='lines+markers', name="Second sample",
                         marker=dict(color=next(color_cycle))),
             row=2, col=1)

fig.update_layout(height=1200, width=800, title_text="Sample signals")
fig.show()

categorical

multi category vs boxplot with mean

plotly

df = pd.DataFrame(np.transpose([np.mean(np.abs(signals), axis=1), targets]))
df.columns = ["signal_mean", "open_channels"]
fig = go.Figure()

channels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

for channel in channels:
    fig.add_trace(go.Box(x=df['open_channels'][df['open_channels'] == channel],
                         y=df['signal_mean'][df['open_channels'] == channel],
                         name=channel,
                         marker=dict(color='seagreen'), showlegend=False)
                         )
    
fig.add_trace(go.Scatter(x=channels,
                         y=[df['signal_mean'][df['open_channels'] == channel].median() for channel in channels],
                         mode="lines+markers",
                         name="median",
                         marker=dict(color='seagreen'), showlegend=False)
                         )

fig.update_layout(title="Signal mean vs. Open channels", xaxis_title="Open channels", yaxis_title="Signal mean")
fig.show()

verify that the DataFrame does not contain any odd values

def valid_dataframe(df):
    nulls = df.isnull().sum().sum()
    assert nulls == 0, f'df includes null values at {df.isnull().any(axis=1).values}'
    assert len(np.unique(df.columns)) == len(df.columns), 'df includes duplicate column names'
    return True

macro f1 score

from sklearn.metrics import f1_score

def MacroF1Metric(preds, dtrain):
    labels = dtrain.get_label()
    preds = np.round(np.clip(preds, 0, 10)).astype(int)
    score = f1_score(labels, preds, average='macro')
    return ('MacroF1Metric', score, True)
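Pass it to lgb.train via feval (a minimal usage sketch, assuming params, lgb_train, and lgb_eval as in the KFold snippets above):

model = lgb.train(params, lgb_train, valid_sets=[lgb_eval], feval=MacroF1Metric)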
seed everything

import os
import random

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(SEED)
EDA utils

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 9 15:28:58 2018
@author: kazuki.onodera
"""
import numpy as np
import pandas as pd
import itertools
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib_venn import venn2


def df_info(target_df, topN=10):
    max_row = target_df.shape[0]
    print(f'Shape: {target_df.shape}')
    df = target_df.dtypes.to_frame()
    df.columns = ['DataType']
    df['#Nulls'] = target_df.isnull().sum()
    df['#Uniques'] = target_df.nunique()

    # stats
    df['Min'] = target_df.min(numeric_only=True)
    df['Mean'] = target_df.mean(numeric_only=True)
    df['Max'] = target_df.max(numeric_only=True)
    df['Std'] = target_df.std(numeric_only=True)

    # top 10 values
    df[f'top{topN} val'] = 0
    df[f'top{topN} cnt'] = 0
    df[f'top{topN} ratio'] = 0
    for c in df.index:
        vc = target_df[c].value_counts().head(topN)
        val = list(vc.index)
        cnt = list(vc.values)
        ratio = list((vc.values / max_row).round(2))
        df.loc[c, f'top{topN} val'] = str(val)
        df.loc[c, f'top{topN} cnt'] = str(cnt)
        df.loc[c, f'top{topN} ratio'] = str(ratio)
    return df
def top_categories(df, category_feature, topN=30):
    return df[category_feature].value_counts().head(topN).index
def count_categories(df, category_features, topN=30, sort='freq', df2=None):
    for c in category_features:
        target_value = df[c].value_counts().head(topN).index
        if sort == 'freq':
            order = target_value
        elif sort == 'alphabetic':
            order = df[c].value_counts().head(topN).sort_index().index
        if df2 is not None:
            plt.subplot(1, 2, 1)
        sns.countplot(x=c, data=df[df[c].isin(order)], order=order)
        plt.xticks(rotation=90)
        if df2 is not None:
            plt.subplot(1, 2, 2)
            sns.countplot(x=c, data=df2[df2[c].isin(order)], order=order)
            plt.xticks(rotation=90)
        if df2 is not None:
            plt.suptitle(f'{c} TOP{topN}', size=25)
        else:
            plt.title(f'{c} TOP{topN}', size=25)
        plt.tight_layout()
        plt.show()
    return
def hist_continuous(df, continuous_features, bins=30, df2=None):
    for c in continuous_features:
        if df2 is not None:
            plt.subplot(1, 2, 1)
        df[c].hist(bins=bins)
        if df2 is not None:
            plt.subplot(1, 2, 2)
            df2[c].hist(bins=bins)
        if df2 is not None:
            plt.suptitle(f'{c}', size=25)
        else:
            plt.title(f'{c}', size=25)
        plt.tight_layout()
        plt.show()
    return
def venn_diagram(train, test, category_features, names=('train', 'test'), figsize=(18, 13)):
    """
    category_features: max==6
    """
    n = int(np.ceil(len(category_features) / 2))
    plt.figure(figsize=figsize)
    for i, c in enumerate(category_features):
        plt.subplot(int(f'{n}2{i+1}'))
        venn2([set(train[c].unique()), set(test[c].unique())],
              set_labels=names)
        plt.title(f'{c}', fontsize=18)
    plt.show()
    return
def split_seq(iterable, size):
    """
    In:  list(split_seq(range(9), 4))
    Out: [[0, 1, 2, 3], [4, 5, 6, 7], [8]]
    """
    it = iter(iterable)
    item = list(itertools.islice(it, size))
    while item:
        yield item
        item = list(itertools.islice(it, size))

general

  1. Check aggregate statistics
  2. Check for outliers

regression

  1. Violin plot with X: categorical variable, Y: target variable
  2. Scatter plot with X: continuous variable, Y: target variable (see the sketch below)
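A minimal seaborn sketch of those two plots (assuming a DataFrame df with a categorical column cat, a continuous column num, and a target column y; all names are hypothetical):

import seaborn as sns
from matplotlib import pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.violinplot(x='cat', y='y', data=df, ax=axes[0])   # categorical vs. target
sns.scatterplot(x='num', y='y', data=df, ax=axes[1])  # continuous vs. target
plt.tight_layout()
plt.show()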

binary classification (2 classes)

multiclass classification
