Created
May 4, 2018 07:27
-
-
Save fanannan/075fc692f62ba49decace2b1c2c20467 to your computer and use it in GitHub Desktop.
Embedding category labels to multiple dimensions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import os | |
import shutil | |
import pandas as pd | |
import numpy as np | |
import math | |
import quandl | |
from fastai.structured import add_datepart, proc_df | |
from fastai.column_data import ColumnarModelData | |
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
from joblib import Memory | |
from tempfile import gettempdir | |
# Disk-backed memoization cache kept under the system temp directory.
# The directory is passed positionally on purpose: the keyword was called
# `cachedir` in old joblib releases and `location` in newer ones, while the
# first positional parameter is the cache directory in both.
memory = Memory(gettempdir(), verbose=0)
# Fetch the time series that is to be modelled.
@memory.cache
def read_df(target_column='target', change=False, VIX=True):
    """Download the raw series and return it as a cleaned DataFrame.

    Args:
        target_column: Name given to the column holding the values to predict.
        change: If true, replace the target by ``pct_change() + 1.0``
            (the first row of the target then becomes NaN).
        VIX: If true, load "CBOE/VIX" through Quandl; otherwise load the
            sunspot-number text file from GitHub.

    Returns:
        A DataFrame containing a ``Date`` column and the renamed target
        column, with +/-inf replaced by NaN, gaps forward-filled and any
        rows that still contain NaN dropped.
    """
    if VIX:
        df = quandl.get("CBOE/VIX")
        original_target = "VIX Close"
        # Turn the "Date" index into an ordinary column.
        df = df.reset_index()
    else:
        # Sunspot numbers: whitespace-separated text with YEAR / MON / SSN columns.
        url = "https://raw.githubusercontent.com/bblais/Python-for-Science/master/examples/spot_num.txt"
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        df = pd.read_csv(url, sep=r'\s+')
        df["Month"] = df["YEAR"].apply(str) + "-" + df["MON"].apply(str)
        df["Date"] = pd.to_datetime(df["Month"], format='%Y-%m')
        original_target = 'SSN'
    # Clean-up: infinities become NaN, then gaps are forward-filled.
    df = df.replace([np.inf, -np.inf], np.nan)
    # `.ffill()` is the non-deprecated spelling of `fillna(method='ffill')`.
    df = df.ffill().dropna()
    df = df.rename(columns={original_target: target_column})
    if change:
        df[target_column] = df[target_column].pct_change() + 1.0
    return df
# Expand the date information into separate feature columns.
def add_date_features(df):
    """Run ``add_datepart`` on the "Date" column and report the columns.

    The frame passed in is handed to ``add_datepart`` and then returned;
    the resulting column index is printed for inspection.
    """
    add_datepart(df, "Date")
    columns_after = df.columns
    print('date features: ', columns_after)
    return df
# Build the names of the lag columns.
def make_lag_labels(num_lags):
    """Return ``["lag_1", ..., "lag_<num_lags>"]`` (empty when num_lags < 1)."""
    lag_numbers = range(1, num_lags + 1)
    return list(map("lag_{}".format, lag_numbers))
# Append the lagged copies of the target.
def add_lags(df, num_lags, target_column='target'):
    """Add columns ``lag_1`` .. ``lag_<num_lags>`` to ``df`` and return it.

    Column ``lag_k`` is the target column shifted down by ``k`` rows and
    cast to float32. The frame is modified in place.
    """
    for offset, column_name in enumerate(make_lag_labels(num_lags), start=1):
        shifted = df[target_column].shift(offset)
        df[column_name] = shifted.astype('float32')
    return df
# Fix the dtype of the categorical columns and pick the columns to use.
def arrange_data(df, num_lags, categories, target_column='target'):
    """Convert the category columns and return the modelling subset.

    Each column named in ``categories`` is converted, in place, to an
    ordered pandas categorical. The returned frame holds the category
    columns, the lag columns and the target column, in that order.
    """
    for name in categories:
        as_category = df[name].astype('category')
        df[name] = as_category.cat.as_ordered()
    wanted_columns = categories + make_lag_labels(num_lags) + [target_column]
    return df[wanted_columns]
# RMSPE, evaluated after undoing the log transform.
def exp_rmspe(pred_values, target_values):
    """Root mean squared percentage error on the exponentiated values.

    Both arguments are exponentiated first; the error of each element is
    ``(target - prediction) / target``.
    """
    actual = np.exp(target_values)
    predicted = np.exp(pred_values)
    relative_error = (actual - predicted) / actual
    squared = relative_error ** 2
    return math.sqrt(squared.mean())
# "MSE", evaluated after undoing the log transform.
def mse(pred_values, target_values):
    """Error between exponentiated predictions and targets.

    NOTE: despite the name, the value returned is the square root of the
    mean squared difference (i.e. an RMSE), not the plain MSE.
    """
    actual = np.exp(target_values)
    predicted = np.exp(pred_values)
    squared_error = (predicted - actual) ** 2
    return math.sqrt(squared_error.mean())
# Return the row numbers used for validation.
def make_validation_index(df, train_ratio):
    """Split the rows into a leading training part and a trailing validation part.

    Returns:
        ``(train_size, validation_index)`` where ``train_size`` is
        ``int(len(df) * train_ratio)`` and ``validation_index`` lists every
        row position from ``train_size`` up to the end.
    """
    n_rows = len(df)
    boundary = int(n_rows * train_ratio)
    return boundary, [*range(boundary, n_rows)]
# Prepare the training data.
def make_train_data(df, target_column='target'):
    """Run ``proc_df`` (with scaling) on a copy of ``df`` and log the target.

    Returns:
        ``(df_train, log_target_values, nas, mapper)``. The target is
        returned as ``log(target + 0.01)`` in float32. Per the original
        author's note, ``nas`` and ``mapper`` carry the scaling information
        needed later to process unseen data.
    """
    processed = proc_df(df.copy(), target_column, do_scale=True, skip_flds=[])
    df_train, target_values, nas, mapper = processed
    # The model works in log space; the small offset keeps log() finite at zero.
    offset_targets = target_values + 0.01
    log_target_values = np.log(offset_targets).astype(np.float32)
    return df_train, log_target_values, nas, mapper
# Build the model.
def build(path, df_categorized, log_target_values, validation_index, categories, max_embedding_dimensions, batch_size):
    """Create the fastai learner with one embedding per categorical column.

    Args:
        path: Working directory handed to ``ColumnarModelData``.
        df_categorized: Feature frame; the columns in ``categories`` are
            treated as categorical, all remaining columns as continuous.
        log_target_values: Log-transformed targets.
        validation_index: Row positions used for validation.
        categories: Names of the categorical columns.
        max_embedding_dimensions: Upper bound on the width of any embedding.
        batch_size: Mini-batch size.

    Returns:
        The learner returned by ``md.get_learner``.
    """
    md = ColumnarModelData.from_data_frame(
        path, validation_index, df_categorized, log_target_values,
        cat_flds=categories, bs=batch_size)
    # Number of labels of each categorical column (largest code + 1).
    cardinalities = [df_categorized[c].max() + 1 for c in categories]
    # (number of labels, embedding width) pairs; the width is half the
    # label count (rounded up), capped at max_embedding_dimensions.
    embedding_size_pairs = [
        (num, min(max_embedding_dimensions, (num + 1) // 2))
        for num in cardinalities
    ]
    num_continuous_columns = len(df_categorized.columns) - len(categories)
    # Output range handed to the learner. Log targets can be negative
    # (e.g. log(0 + 0.01)), so the lower bound must not be pinned at 0 in
    # that case; it stays at 0 whenever all log targets are non-negative.
    y_low = min(0, np.min(log_target_values) * 1.2)
    y_high = np.max(log_target_values) * 1.2
    y_range = (y_low, y_high)
    m = md.get_learner(
        embedding_size_pairs,
        num_continuous_columns,
        emb_drop=0.04,
        out_sz=1,
        szs=[1000, 500],        # layer sizes (presumably hidden-layer widths; confirm in fastai)
        drops=[0.001, 0.01],    # dropout rates
        y_range=y_range)
    return m
# Fit the model.
def run_fit(m, lr):
    """Train learner ``m`` at learning rate ``lr`` and return it.

    Runs ``m.fit`` for 2 cycles with ``cycle_len=1`` and ``cycle_mult=2``,
    reporting ``exp_rmspe`` and ``mse`` as metrics.
    """
    reported_metrics = [exp_rmspe, mse]
    m.fit(lr, 2, metrics=reported_metrics, cycle_len=1, cycle_mult=2)
    # Alternative schedules left by the author:
    #m.fit(lr, 2, metrics=[exp_rmspe, mse], cycle_len=2, cycle_mult=3)
    #for u in range(10):
    #    m.fit(lr, 1, metrics=[exp_rmspe, mse])
    return m
# Assemble the data used for the chart.
def create_output_df(m, idx):
    """Return a frame with ``Date``, ``actual`` and ``predicted`` columns.

    ``actual`` is the exponentiated training targets followed by the
    exponentiated validation targets. ``predicted`` is NaN for every
    training row, followed by the exponentiated output of ``m.predict()``.
    ``idx`` is stored unchanged as the ``Date`` column.
    """
    # np.append flattens both arguments, so nested shapes collapse to 1-D.
    actual = np.append(np.exp(m.data.trn_y), np.exp(m.data.val_y))
    no_prediction = [np.nan] * len(m.data.trn_y)
    predicted = np.append(no_prediction, np.exp(m.predict()))
    return pd.DataFrame({"Date": idx, "actual": actual, "predicted": predicted})
# Script entry point. Guarded so that importing this module does not wipe
# the working directory or start a training run.
if __name__ == "__main__":
    # ---- configuration ----
    VIX = False       # passed to read_df: False -> sunspot data, True -> CBOE VIX
    change = False    # passed to read_df: True -> model pct_change()+1 of the target
    # Trailing '' keeps the trailing separator the original path string had.
    path = os.path.join(gettempdir(), 'embedding_model', '')
    # Start from an empty working directory on every run.
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)
    num_lags = 12
    categories = ["Year", "Month", "Dayofyear", "Is_quarter_end", "Is_quarter_start", "Is_year_end", "Is_year_start"]
    if VIX:
        categories += ['Week', 'Is_month_end', 'Is_month_start']
    train_ratio = 0.75
    max_embedding_dimensions = 50
    batch_size = 50
    find_lr = False   # set to True to run the learning-rate finder before fitting

    # ---- data preparation ----
    df = read_df(change=change, VIX=VIX)
    df = add_date_features(df)
    df = add_lags(df, num_lags)
    df = arrange_data(df, num_lags, categories)
    train_size, validation_index = make_validation_index(df, train_ratio)
    df_categorized, log_target_values, nas, mapper = make_train_data(df)

    # ---- model ----
    m = build(path, df_categorized, log_target_values, validation_index, categories, max_embedding_dimensions, batch_size)
    if find_lr:
        # Evaluate candidate learning rates.
        m.lr_find()
        m.sched.plot(1)
        plt.show()
    lr = 0.01  # same value as the original literal `10e-3`
    m = run_fit(m, lr)
    print(m)
    print(m.model.embs[0])
    print(m.model.embs[0].__class__)
    print(m.model.embs[0].__class__.__dict__)

    # ---- chart: last 1000 rows of actual vs. predicted ----
    outp = create_output_df(m, df.index)
    plt.plot(outp["actual"][-1000:])
    plt.plot(outp["predicted"][-1000:])
    plt.show()
    # TODO: split off a test set and evaluate on it
    # TODO: model on the rate of change
    # TODO: output the embedded dimensions
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment