Skip to content

Instantly share code, notes, and snippets.

@fanannan
Created May 4, 2018 07:27
Show Gist options
  • Save fanannan/075fc692f62ba49decace2b1c2c20467 to your computer and use it in GitHub Desktop.
Embedding category labels to multiple dimensions
# coding: utf-8
import os
import shutil
import pandas as pd
import numpy as np
import math
import quandl
from fastai.structured import add_datepart, proc_df
from fastai.column_data import ColumnarModelData
import matplotlib as mpl
import matplotlib.pyplot as plt
from joblib import Memory
from tempfile import gettempdir
# Disk-backed memoization cache in the system temp dir, with quiet output.
# NOTE(review): `cachedir=` is deprecated in newer joblib releases (use
# `location=`) — confirm against the pinned joblib version before changing.
memory = Memory(cachedir=gettempdir(), verbose=0)
# Fetch the target time series to be modelled (cached on disk between runs).
@memory.cache
def read_df(target_column='target', change=False, VIX=True):
    """Load the raw series and return a cleaned DataFrame.

    VIX=True pulls CBOE VIX closes from Quandl; otherwise monthly sunspot
    numbers are read from a public text file. The chosen series is renamed
    to `target_column`. change=True replaces levels with (1 + pct_change).
    """
    if VIX:
        df = quandl.get("CBOE/VIX")
        original_target = "VIX Close"
        # Promote the "Date" index to a regular column.
        df = df.reset_index()
    else:
        # Sunspot numbers; a "Date" column is synthesised from YEAR/MON.
        url = "https://raw.githubusercontent.com/bblais/Python-for-Science/master/examples/spot_num.txt"
        # raw string so the regex separator doesn't trigger an invalid-escape warning
        df = pd.read_csv(url, sep=r'\s+')
        df["Month"] = df["YEAR"].apply(str)+"-"+df["MON"].apply(str)
        df["Date"] = pd.to_datetime(df["Month"], format='%Y-%m')
        original_target = 'SSN'
    # Clean-up: infinities -> NaN, forward-fill, then drop any leading NaNs.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.ffill().dropna()
    df = df.rename(columns={original_target: target_column})
    if change:
        # Model the rate of change instead of the level.
        df[target_column] = df[target_column].pct_change()+1.0
    return df
# Expand the "Date" column into derived calendar feature columns.
def add_date_features(df):
    """Replace "Date" with fastai date-part columns (mutates df in place)."""
    add_datepart(df, "Date")
    print('date features: ', df.columns)
    return df
# Names of the lag feature columns.
def make_lag_labels(num_lags):
    """Return ["lag_1", ..., "lag_<num_lags>"] (empty list for num_lags=0)."""
    labels = []
    for lag in range(1, num_lags + 1):
        labels.append("lag_" + str(lag))
    return labels
# Append the lag feature columns to the frame.
def add_lags(df, num_lags, target_column='target'):
    """Add lag_1..lag_<num_lags>: the target shifted by that many rows, as float32."""
    labels = make_lag_labels(num_lags)
    for shift_by, label in zip(range(1, num_lags + 1), labels):
        df[label] = df[target_column].shift(shift_by).astype('float32')
    return df
# Fix category column dtypes (ordered categoricals) and select the model columns.
def arrange_data(df, num_lags, categories, target_column='target'):
    """Return a frame with only the category, lag, and target columns."""
    for column in categories:
        df[column] = df[column].astype('category').cat.as_ordered()
    keep = categories + make_lag_labels(num_lags) + [target_column]
    return df[keep]
# RMSPE, evaluated after undoing the log transform.
def exp_rmspe(pred_values, target_values):
    """Root-mean-square percentage error on the exponentiated values."""
    actual = np.exp(target_values)
    predicted = np.exp(pred_values)
    relative_error = (actual - predicted) / actual
    return math.sqrt((relative_error ** 2).mean())
# Error metric, evaluated after undoing the log transform.
# NOTE(review): despite the name, this computes ROOT-mean-square error (RMSE),
# not MSE — kept as-is because it is referenced by name in run_fit's metrics.
def mse(pred_values, target_values):
    """Root-mean-square error on the exponentiated values."""
    predicted = np.exp(pred_values)
    actual = np.exp(target_values)
    residuals = predicted - actual
    return math.sqrt((residuals ** 2).mean())
# Chronological split: rows after train_ratio of the data become validation rows.
def make_validation_index(df, train_ratio):
    """Return (train_size, list of validation row positions at the tail)."""
    total = len(df)
    cutoff = int(total * train_ratio)
    validation_rows = list(range(cutoff, total))
    return cutoff, validation_rows
# Prepare the training data.
def make_train_data(df, target_column='target'):
    """Scale features via fastai's proc_df and log-transform the target.

    Returns (df_train, log_target_values, nas, mapper); nas and mapper carry
    the scaling/NA-fill state needed to process unseen data identically.
    """
    df_train, target_values, nas, mapper = proc_df(
        df.copy(), target_column, do_scale=True, skip_flds=[])
    # Targets are modelled in log space; the 0.01 offset guards against log(0).
    log_target_values = np.log(target_values + 0.01).astype(np.float32)
    return df_train, log_target_values, nas, mapper
# Build the columnar learner with entity embeddings for the category columns.
def build(path, df_categorized, log_target_values, validation_index, categories, max_embedding_dimensions, batch_size):
    md = ColumnarModelData.from_data_frame(path, validation_index, df_categorized, log_target_values, cat_flds=categories, bs=batch_size)
    # List of (category name, number of labels) pairs.
    # Assumes each category column already holds 0-based integer codes — TODO confirm
    # that this runs after proc_df has encoded them.
    cat_size_pairs = [(c, max(df_categorized[c])+1) for c in categories]
    # List of (number of labels, embedding width) pairs; width is roughly half the
    # label count, capped at max_embedding_dimensions.
    embedding_size_pairs = [(num, min(max_embedding_dimensions, (num+1)//2)) for _, num in cat_size_pairs]
    # Continuous inputs = every remaining column.
    num_continous_columns = len(df_categorized.columns)-len(categories)
    # Output range: 0 up to 20% above the largest observed log target
    # (presumably used by the learner to clamp predictions — verify in fastai docs).
    y_range = (0, np.max(log_target_values)*1.2)
    m = md.get_learner(
        embedding_size_pairs,
        num_continous_columns,
        emb_drop=0.04,        # dropout on the embedding layer
        out_sz=1,             # single regression output
        szs=[1000, 500],      # hidden layer sizes
        drops=[0.001, 0.01],  # dropout per hidden layer
        y_range=y_range)
    return m
# Fit the model.
def run_fit(m, lr):
    """Run one SGDR fit (2 cycles, cycle_len=1 doubling) and return the learner."""
    m.fit(lr, 2, metrics=[exp_rmspe, mse], cycle_len=1, cycle_mult=2)
    return m
# Build the chart-output frame.
def create_output_df(m, idx):
    """Return a Date/actual/predicted frame with values exponentiated back.

    Predictions exist only for the validation rows; the training rows are
    padded with NaN so both columns align with idx.
    """
    actual = np.append([np.exp(m.data.trn_y)], [np.exp(m.data.val_y)])
    padding = [np.nan for i in m.data.trn_y]
    predicted = np.append(padding, [np.exp(m.predict())])
    frame = pd.DataFrame({"Date": idx, "actual": actual, "predicted": predicted})
    return frame
# Script entry point: fetch the series, engineer features, fit the
# embedding model, and plot actual vs. predicted.
if True:
    VIX = False       # False -> sunspot data instead of VIX
    change = False    # True -> model rates of change instead of levels
    # Start from a clean model directory under the system temp dir.
    path = gettempdir()+'/embedding_model/'
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)
    num_lags = 12
    # Calendar columns (produced by add_datepart) treated as categories.
    categories = ["Year", "Month" ,"Dayofyear", "Is_quarter_end", "Is_quarter_start", "Is_year_end", "Is_year_start"]
    if VIX:
        # Daily data has extra calendar granularity worth embedding.
        categories += ['Week', 'Is_month_end', 'Is_month_start']
    train_ratio = 0.75
    max_embedding_dimensions = 50
    batch_size = 50
    df = read_df(change=change, VIX=VIX)
    df = add_date_features(df)
    df = add_lags(df, num_lags)
    df = arrange_data(df, num_lags, categories)
    # NOTE(review): train_size, nas and mapper are computed but never used below.
    train_size, validation_index = make_validation_index(df, train_ratio)
    df_categorized, log_target_values, nas, mapper = make_train_data(df)
    m = build(path, df_categorized, log_target_values, validation_index, categories, max_embedding_dimensions, batch_size)
    if False:
        # Learning-rate finder (disabled).
        m.lr_find()
        m.sched.plot(1)
        plt.show()
    lr = 10e-3
    m = run_fit(m, lr)
    # Inspect the learner and its first embedding layer.
    print(m)
    print(m.model.embs[0])
    print(m.model.embs[0].__class__)
    print(m.model.embs[0].__class__.__dict__)
    outp = create_output_df(m, df.index)
    # Plot the last 1000 points of actual vs. predicted.
    plt.plot(outp["actual"][-1000:])
    plt.plot(outp["predicted"][-1000:])
    plt.show()
    # TODO: split off and evaluate a held-out test set
    # TODO: try the rate-of-change model
    # TODO: export the embedded dimensions
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment