fanannan/vxx_intraday(ipnb).py

## vxx_intraday(ipnb).py
%matplotlib inline
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import glob
from joblib import Memory
from pandas_datareader import wb
import datetime as dt
# On-disk joblib cache rooted at /tmp/; applied below through @memory.cache.
memory = Memory('/tmp/')

# Use a 10pt font in every matplotlib figure drawn by this script.
mpl.rcParams.update({'font.size': 10})
# Location handed to conv_all() when the data set is loaded.
DATA_FILE_PATH = './'

def load(file_path):
    """Read one CSV file into a DataFrame, or return None when it cannot be read.

    Args:
        file_path: Path of the CSV file handed to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` if the file is missing,
        unreadable, empty or malformed.
    """
    import warnings  # local import: only used on the failure path

    try:
        return pd.read_csv(file_path)
    except (OSError, ValueError) as exc:
        # Best-effort loading: one bad file must not abort the whole batch.
        # OSError covers missing/unreadable paths; pandas' ParserError and
        # EmptyDataError, as well as UnicodeDecodeError, derive from ValueError.
        # Everything else (KeyboardInterrupt, MemoryError, ...) propagates
        # instead of being silently swallowed.
        warnings.warn('skipping {}: {}'.format(file_path, exc))
        return None

def get_file_list(path):
    """Return every ``*.txt*`` file found directly inside the given directories.

    Args:
        path: A single directory (``str`` or ``os.PathLike``) or an iterable
            of directories.

    Returns:
        A list holding the glob matches of each directory, in the order the
        directories were given.
    """
    # A bare string is ONE directory.  Iterating it would walk its characters,
    # which made './' be searched as the two directories '.' and '/'.
    if isinstance(path, (str, os.PathLike)):
        directories = [path]
    else:
        directories = path
    matches = []
    for directory in directories:
        matches.extend(glob.glob(os.path.join(directory, '*.txt*')))
    return matches

@memory.cache
def read_all(path):
    """Load every data file found for ``path``; results go through the joblib cache.

    Returns a list with one entry per file reported by ``get_file_list(path)``,
    each entry being whatever ``load`` returned for that file.
    """
    frames = []
    for file_name in get_file_list(path):
        frames.append(load(file_name))
    return frames

def read_date(s):
    """Parse a ``YYYY-MM-DD`` string and return it as a ``datetime.date``."""
    return dt.datetime.strptime(s, '%Y-%m-%d').date()

def to_weekday(s):
    """Return the weekday number (Monday is 0, Sunday is 6) of a ``YYYY-MM-DD`` string."""
    return read_date(s).weekday()

def find_business_day(s, lis):
    """Return the day after ``s`` (Monday after a Friday) if ``lis`` contains it.

    ``s`` is a ``YYYY-MM-DD`` string.  The candidate is the next calendar day,
    except that a Friday jumps three days ahead.  The candidate is returned in
    the same string format when it is found in ``lis``; otherwise the result
    is ``None``.
    """
    current = read_date(s)
    # weekday() == 4 is Friday: step over the weekend.
    step = 3 if current.weekday() == 4 else 1
    candidate = (current + dt.timedelta(days=step)).strftime('%Y-%m-%d')
    if candidate in lis:
        return candidate
    return None

#@memory.cache
def conv_all(path):
    """Build one row of intraday log returns per trading day.

    Every frame returned by ``read_all(path)`` is expected to carry the
    columns ``Date`` (``YYYY-MM-DD`` strings), ``Time`` (``HH:MM:SS`` strings),
    ``Open`` and ``Close``.  For each date these prices are collected:

    * ``p0930``: ``Open`` of the day's opening bar,
    * ``p1000`` ... ``p1600``: ``Close`` of the bar found for each hour of
      ``time_list``,
    * ``p3330``: ``Open`` of the opening bar of the next business day.

    They are then turned into log returns: ``c<time>`` is measured against
    ``p0930`` and ``r<time>`` against the previous sampling point.

    Args:
        path: Forwarded unchanged to ``read_all``.

    Returns:
        A DataFrame indexed by date string with the columns ``p0930``,
        ``c1000``, ``r1000``, ..., ``c3330``, ``r3330``.  Days with any missing
        price are dropped, and so are duplicated rows.  The frame is empty
        when no complete day is available.
    """
    time_list = [10, 11, 12, 13, 14, 15, 16]
    suffixes = [str(hour) + '00' for hour in time_list] + ['3330']
    rows = []
    for df in read_all(path):
        if df is None:
            # Entries that could not be loaded are None; skip them.
            continue
        # Slice the file once per date instead of re-filtering it for every day.
        by_date = {date: day for date, day in df.groupby('Date')}
        for date in by_date:
            prices = _collect_prices(date, by_date, time_list)
            returns = _to_log_returns(prices, suffixes).dropna()
            if len(returns.index) > 0:
                rows.append(returns)
    if not rows:
        # pd.concat() rejects an empty list; hand back an empty, typed frame.
        columns = ['p0930'] + [kind + s for s in suffixes for kind in ('c', 'r')]
        return pd.DataFrame(columns=columns, dtype=float)
    return pd.concat(rows, axis=0).drop_duplicates()


def _session_clock(day):
    """Return ``(opening bar time, hour shift)`` for one day's bars.

    A day holding a bar stamped 14:35:00 is treated as a summer-time session:
    its opening bar is 14:35:00 and hour ``h`` of ``time_list`` is looked up
    at ``h + 5``.  Otherwise the opening bar is 15:35:00 and the shift is 6.
    """
    # The check has to look at the Time VALUES; testing the column names can
    # never match and left the summer-time branch dead.
    # NOTE(review): assumes a 14:35:00 bar exists only on summer-time days -
    # confirm against the data feed.
    if (day['Time'] == '14:35:00').any():
        return '14:35:00', 14 - 9
    return '15:35:00', 15 - 9


def _bar_price(day, bar_time, field, label):
    """Return ``field`` of the bar(s) stamped ``bar_time`` as column ``label``, indexed by date."""
    bars = day[day['Time'] == bar_time]
    return bars[['Date', field]].set_index('Date').rename(columns={field: label})


def _collect_prices(date, by_date, time_list):
    """Gather the raw prices of ``date`` into one frame (normally a single row)."""
    day = by_date[date]
    open_time, hour_shift = _session_clock(day)
    # Opening price (mind daylight-saving time).
    frames = [_bar_price(day, open_time, 'Open', 'p0930')]
    # Hourly closes, relabelled p1000 ... p1600.
    for hour in time_list:
        stamp = str(hour + hour_shift) + ':00:00'
        frames.append(_bar_price(day, stamp, 'Close', 'p' + str(hour) + '00'))
    next_open = np.nan
    next_date = find_business_day(date, by_date)
    if next_date is not None:
        following = by_date[next_date]
        # Use the following day's own clock so that the summer/winter
        # switch-over days pick the right opening bar as well.
        bars = following[following['Time'] == _session_clock(following)[0]]
        if len(bars) > 0:
            next_open = bars['Open'].values[0]
    frames.append(pd.DataFrame([next_open], index=[date], columns=['p3330']))
    return pd.concat(frames, axis=1)


def _to_log_returns(prices, suffixes):
    """Replace every ``p<suffix>`` column of ``prices`` by its ``c``/``r`` log returns."""
    open_price = prices['p0930']
    previous = open_price
    for suffix in suffixes:
        price = prices.pop('p' + suffix)
        # c: change since the open, r: change since the previous sample.
        prices['c' + suffix] = (price / open_price).apply(np.log)
        prices['r' + suffix] = (price / previous).apply(np.log)
        previous = price
    return prices

# Build the per-day return table from the files under DATA_FILE_PATH.
df_data = conv_all(DATA_FILE_PATH)

# Percentiles requested from DataFrame.describe() for the summary plots.
PERCENTILES = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

def get_cumulative(df):
    """Summarise the cumulative (``c``-prefixed) return columns of ``df``.

    Returns the ``describe()`` table, computed with ``PERCENTILES``, restricted
    to the eight cumulative columns.
    """
    wanted = ['c1000','c1100','c1200','c1300','c1400','c1500','c1600', 'c3330']
    stats = df.describe(percentiles=PERCENTILES)
    return stats[wanted]

def get_relative(df):
    """Summarise the step-to-step (``r``-prefixed) return columns of ``df``.

    Returns the ``describe()`` table, computed with ``PERCENTILES``, restricted
    to the eight relative columns.
    """
    wanted = ['r1000','r1100','r1200','r1300','r1400','r1500','r1600', 'r3330']
    stats = df.describe(percentiles=PERCENTILES)
    return stats[wanted]

def explain(df_data, opt=""):
    """Plot the cumulative and the relative movement summaries side by side.

    ``opt`` is passed on to ``draw`` as the title prefix of both panels.
    """
    cumulative = get_cumulative(df_data)
    relative = get_relative(df_data)
    figure = plt.figure(figsize=(12, 4))
    panels = [
        (cumulative, (-0.05, 0.05), 'cumulative'),
        (relative, (-0.02, 0.02), 'relative'),
    ]
    # Left panel: cumulative returns; right panel: step-to-step returns.
    for position, (summary, y_limits, title) in enumerate(panels, start=1):
        axes = figure.add_subplot(1, 2, position)
        axes.set_ylim(y_limits)
        draw(axes, summary, title, opt)

def draw(ax, df_summary, title, opt):
    """Plot the mean and every percentile row of a ``describe()`` summary.

    Args:
        ax: Axes to draw on; ``set_title``, ``grid`` and ``plot`` are used.
        df_summary: Summary frame whose index holds ``'mean'`` plus percentile
            labels such as ``'10%'`` and whose column names are one letter
            followed by a number (``'c1000'``); the number becomes the x value.
            The frame is left unmodified.
        title: Shown in the axes title.  Any value other than ``'relative'``
            makes every curve start at the extra point (930, 0).
        opt: Prefix prepended to the axes title.
    """
    # Relabel a copy so that the caller's frame keeps its column names.
    summary = df_summary.rename(columns=lambda name: int(name[1:]))
    ax.set_title(opt+title+' movement')
    ax.grid()
    if title != 'relative':
        x_head, y_head = [930], [0]
    else:
        x_head, y_head = [], []
    x = x_head + summary.columns.tolist()
    # .loc replaces the .ix indexer, which was removed in pandas 1.0.
    ax.plot(x, y_head + summary.loc['mean'].values.tolist(), linewidth=4)
    # Take the percentile rows from the summary itself: rebuilding the labels
    # from floats with int(p*100) truncates (0.29 would become '28%').
    for label in summary.index:
        if str(label).endswith('%'):
            ax.plot(x, y_head + summary.loc[label].values.tolist())

# Summary figure over every day of the data set.
explain(df_data)

def explain_by_weekday(df_data):
    """Draw the movement summary separately for Monday through Friday.

    ``df_data`` must be indexed by ``YYYY-MM-DD`` strings.  For each weekday
    its number and row count are printed before the figure is drawn.
    """
    # Spelled out in full: appending 'day' to three-letter stems produced
    # the titles 'tueday', 'wedday' and 'thrday'.
    day_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']
    df_wd = df_data.copy()
    df_wd['weekday'] = [to_weekday(t) for t in df_wd.index]
    for d, name in enumerate(day_names):
        df = df_wd[df_wd['weekday'] == d]
        print(d, len(df))
        explain(df, name + ' / ')

# One summary figure per weekday.
explain_by_weekday(df_data)

def to_day(s):
    """Return the day of the month of a ``YYYY-MM-DD`` date string."""
    return dt.datetime.strptime(s, '%Y-%m-%d').day

def explain_by_day(df_data):
    """Draw the movement summary for the early, mid and late part of the month.

    Early is day <= 10, mid is 10 < day < 20 and late is day >= 20.
    ``df_data`` must be indexed by ``YYYY-MM-DD`` strings.
    """
    tagged = df_data.copy()
    tagged['day'] = [to_day(stamp) for stamp in tagged.index]
    day = tagged['day']
    explain(tagged[day <= 10], 'early month / ')
    explain(tagged[(day > 10) & (day < 20)], 'mid month / ')
    explain(tagged[day >= 20], 'late month / ')

# One summary figure per third of the month.
explain_by_day(df_data)
	%matplotlib inline
	import os
	import numpy as np
	import matplotlib as mpl
	import matplotlib.pyplot as plt
	import pandas as pd
	import glob
	from joblib import Memory
	from pandas_datareader import wb
	import datetime as dt
	# On-disk joblib cache rooted at /tmp/; applied below through @memory.cache.
	memory = Memory('/tmp/')

	# Use a 10pt font in every matplotlib figure drawn by this script.
	mpl.rcParams.update({'font.size': 10})
	# Location handed to conv_all() when the data set is loaded.
	DATA_FILE_PATH = './'

	def load(file_path):
	    """Read one CSV file into a DataFrame, or return None when it cannot be read.

	    Args:
	        file_path: Path of the CSV file handed to ``pd.read_csv``.

	    Returns:
	        The parsed ``pandas.DataFrame``, or ``None`` if the file is missing,
	        unreadable, empty or malformed.
	    """
	    import warnings  # local import: only used on the failure path

	    try:
	        return pd.read_csv(file_path)
	    except (OSError, ValueError) as exc:
	        # Best-effort loading: one bad file must not abort the whole batch.
	        # OSError covers missing/unreadable paths; pandas' ParserError and
	        # EmptyDataError, as well as UnicodeDecodeError, derive from
	        # ValueError.  Everything else propagates instead of being
	        # silently swallowed.
	        warnings.warn('skipping {}: {}'.format(file_path, exc))
	        return None

	def get_file_list(path):
	    """Return every ``*.txt*`` file found directly inside the given directories.

	    Args:
	        path: A single directory (``str`` or ``os.PathLike``) or an iterable
	            of directories.

	    Returns:
	        A list holding the glob matches of each directory, in the order the
	        directories were given.
	    """
	    # A bare string is ONE directory.  Iterating it would walk its
	    # characters, which made './' be searched as '.' and '/'.
	    if isinstance(path, (str, os.PathLike)):
	        directories = [path]
	    else:
	        directories = path
	    matches = []
	    for directory in directories:
	        # The wildcards are required: a literal '.txt' matches no data file.
	        matches.extend(glob.glob(os.path.join(directory, '*.txt*')))
	    return matches

	@memory.cache
	def read_all(path):
	    """Load every data file found for ``path``; results go through the joblib cache.

	    Returns a list with one entry per file reported by
	    ``get_file_list(path)``, each entry being whatever ``load`` returned.
	    """
	    return [load(f) for f in get_file_list(path)]

	def read_date(s):
	    """Parse a ``YYYY-MM-DD`` string and return it as a ``datetime.date``."""
	    return dt.datetime.strptime(s, '%Y-%m-%d').date()

	def to_weekday(s):
	    """Return the weekday number (Monday is 0, Sunday is 6) of a ``YYYY-MM-DD`` string."""
	    return read_date(s).weekday()

	def find_business_day(s, lis):
	    """Return the day after ``s`` (Monday after a Friday) if ``lis`` contains it.

	    ``s`` is a ``YYYY-MM-DD`` string.  The candidate is the next calendar
	    day, except that a Friday jumps three days ahead.  The candidate is
	    returned in the same string format when it is found in ``lis``;
	    otherwise the result is ``None``.
	    """
	    today = read_date(s)
	    # weekday() == 4 is Friday: step over the weekend.
	    step = 3 if today.weekday() == 4 else 1
	    nds = (today + dt.timedelta(days=step)).strftime('%Y-%m-%d')
	    return nds if nds in lis else None

	#@memory.cache
	def conv_all(path):
	    """Build one row of intraday log returns per trading day.

	    Every frame returned by ``read_all(path)`` is expected to carry the
	    columns ``Date`` (``YYYY-MM-DD`` strings), ``Time`` (``HH:MM:SS``
	    strings), ``Open`` and ``Close``.  For each date these prices are
	    collected:

	    * ``p0930``: ``Open`` of the day's opening bar,
	    * ``p1000`` ... ``p1600``: ``Close`` of the bar found for each hour of
	      ``time_list``,
	    * ``p3330``: ``Open`` of the opening bar of the next business day.

	    They are then turned into log returns: ``c<time>`` is measured against
	    ``p0930`` and ``r<time>`` against the previous sampling point.

	    Args:
	        path: Forwarded unchanged to ``read_all``.

	    Returns:
	        A DataFrame indexed by date string with the columns ``p0930``,
	        ``c1000``, ``r1000``, ..., ``c3330``, ``r3330``.  Days with any
	        missing price are dropped, and so are duplicated rows.  The frame
	        is empty when no complete day is available.
	    """
	    time_list = [10, 11, 12, 13, 14, 15, 16]
	    suffixes = [str(hour) + '00' for hour in time_list] + ['3330']
	    rows = []
	    for df in read_all(path):
	        if df is None:
	            # Entries that could not be loaded are None; skip them.
	            continue
	        # Slice the file once per date instead of re-filtering it per day.
	        by_date = {date: day for date, day in df.groupby('Date')}
	        for date in by_date:
	            prices = _collect_prices(date, by_date, time_list)
	            returns = _to_log_returns(prices, suffixes).dropna()
	            if len(returns.index) > 0:
	                rows.append(returns)
	    if not rows:
	        # pd.concat() rejects an empty list; hand back an empty, typed frame.
	        columns = ['p0930'] + [kind + s for s in suffixes for kind in ('c', 'r')]
	        return pd.DataFrame(columns=columns, dtype=float)
	    return pd.concat(rows, axis=0).drop_duplicates()


	def _session_clock(day):
	    """Return ``(opening bar time, hour shift)`` for one day's bars.

	    A day holding a bar stamped 14:35:00 is treated as a summer-time
	    session: its opening bar is 14:35:00 and hour ``h`` of ``time_list`` is
	    looked up at ``h + 5``.  Otherwise the opening bar is 15:35:00 and the
	    shift is 6.
	    """
	    # The check has to look at the Time VALUES; testing the column names
	    # can never match and left the summer-time branch dead.
	    # NOTE(review): assumes a 14:35:00 bar exists only on summer-time days -
	    # confirm against the data feed.
	    if (day['Time'] == '14:35:00').any():
	        return '14:35:00', 14 - 9
	    return '15:35:00', 15 - 9


	def _bar_price(day, bar_time, field, label):
	    """Return ``field`` of the bar(s) stamped ``bar_time`` as column ``label``, indexed by date."""
	    bars = day[day['Time'] == bar_time]
	    return bars[['Date', field]].set_index('Date').rename(columns={field: label})


	def _collect_prices(date, by_date, time_list):
	    """Gather the raw prices of ``date`` into one frame (normally a single row)."""
	    day = by_date[date]
	    open_time, hour_shift = _session_clock(day)
	    # Opening price (mind daylight-saving time).
	    frames = [_bar_price(day, open_time, 'Open', 'p0930')]
	    # Hourly closes, relabelled p1000 ... p1600.
	    for hour in time_list:
	        stamp = str(hour + hour_shift) + ':00:00'
	        frames.append(_bar_price(day, stamp, 'Close', 'p' + str(hour) + '00'))
	    next_open = np.nan
	    next_date = find_business_day(date, by_date)
	    if next_date is not None:
	        following = by_date[next_date]
	        # Use the following day's own clock so that the summer/winter
	        # switch-over days pick the right opening bar as well.
	        bars = following[following['Time'] == _session_clock(following)[0]]
	        if len(bars) > 0:
	            next_open = bars['Open'].values[0]
	    frames.append(pd.DataFrame([next_open], index=[date], columns=['p3330']))
	    return pd.concat(frames, axis=1)


	def _to_log_returns(prices, suffixes):
	    """Replace every ``p<suffix>`` column of ``prices`` by its ``c``/``r`` log returns."""
	    open_price = prices['p0930']
	    previous = open_price
	    for suffix in suffixes:
	        price = prices.pop('p' + suffix)
	        # c: change since the open, r: change since the previous sample.
	        prices['c' + suffix] = (price / open_price).apply(np.log)
	        prices['r' + suffix] = (price / previous).apply(np.log)
	        previous = price
	    return prices

	# Build the per-day return table from the files under DATA_FILE_PATH.
	df_data = conv_all(DATA_FILE_PATH)

	# Percentiles requested from DataFrame.describe() for the summary plots.
	PERCENTILES = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

	def get_cumulative(df):
	    """Summarise the cumulative (``c``-prefixed) return columns of ``df``.

	    Returns the ``describe()`` table, computed with ``PERCENTILES``,
	    restricted to the eight cumulative columns.
	    """
	    return df.describe(percentiles=PERCENTILES)[['c1000','c1100','c1200','c1300','c1400','c1500','c1600', 'c3330']]

	def get_relative(df):
	    """Summarise the step-to-step (``r``-prefixed) return columns of ``df``.

	    Returns the ``describe()`` table, computed with ``PERCENTILES``,
	    restricted to the eight relative columns.
	    """
	    return df.describe(percentiles=PERCENTILES)[['r1000','r1100','r1200','r1300','r1400','r1500','r1600', 'r3330']]

	def explain(df_data, opt=""):
	    """Plot the cumulative and the relative movement summaries side by side.

	    ``opt`` is passed on to ``draw`` as the title prefix of both panels.
	    """
	    df_cumulative = get_cumulative(df_data)
	    df_relative = get_relative(df_data)
	    fig = plt.figure(figsize=(12, 4))
	    # Left panel: cumulative returns.
	    ax = fig.add_subplot(1,2,1)
	    ax.set_ylim((-0.05, 0.05))
	    draw(ax, df_cumulative, 'cumulative', opt)
	    # Right panel: step-to-step returns.
	    bx = fig.add_subplot(1,2,2)
	    bx.set_ylim((-0.02, 0.02))
	    draw(bx, df_relative, 'relative', opt)

	def draw(ax, df_summary, title, opt):
	    """Plot the mean and every percentile row of a ``describe()`` summary.

	    Args:
	        ax: Axes to draw on; ``set_title``, ``grid`` and ``plot`` are used.
	        df_summary: Summary frame whose index holds ``'mean'`` plus
	            percentile labels such as ``'10%'`` and whose column names are
	            one letter followed by a number (``'c1000'``); the number
	            becomes the x value.  The frame is left unmodified.
	        title: Shown in the axes title.  Any value other than
	            ``'relative'`` makes every curve start at the extra point
	            (930, 0).
	        opt: Prefix prepended to the axes title.
	    """
	    # Relabel a copy so that the caller's frame keeps its column names.
	    summary = df_summary.rename(columns=lambda name: int(name[1:]))
	    ax.set_title(opt+title+' movement')
	    ax.grid()
	    if title != 'relative':
	        x_head, y_head = [930], [0]
	    else:
	        x_head, y_head = [], []
	    x = x_head + summary.columns.tolist()
	    # .loc replaces the .ix indexer, which was removed in pandas 1.0.
	    ax.plot(x, y_head + summary.loc['mean'].values.tolist(), linewidth=4)
	    # Take the percentile rows from the summary itself: rebuilding the
	    # labels from floats with int(p*100) truncates (0.29 becomes '28%').
	    for label in summary.index:
	        if str(label).endswith('%'):
	            ax.plot(x, y_head + summary.loc[label].values.tolist())

	# Summary figure over every day of the data set.
	explain(df_data)

	def explain_by_weekday(df_data):
	    """Draw the movement summary separately for Monday through Friday.

	    ``df_data`` must be indexed by ``YYYY-MM-DD`` strings.  For each weekday
	    its number and row count are printed before the figure is drawn.
	    """
	    # Spelled out in full: appending 'day' to three-letter stems produced
	    # the titles 'tueday', 'wedday' and 'thrday'.
	    day_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']
	    df_wd = df_data.copy()
	    df_wd['weekday'] = [to_weekday(t) for t in df_wd.index]
	    for d, name in enumerate(day_names):
	        df = df_wd[df_wd['weekday'] == d]
	        print(d, len(df))
	        explain(df, name + ' / ')

	# One summary figure per weekday.
	explain_by_weekday(df_data)

	def to_day(s):
	    """Return the day of the month of a ``YYYY-MM-DD`` date string."""
	    return dt.datetime.strptime(s, '%Y-%m-%d').day

	def explain_by_day(df_data):
	    """Draw the movement summary for the early, mid and late part of the month.

	    Early is day <= 10, mid is 10 < day < 20 and late is day >= 20.
	    ``df_data`` must be indexed by ``YYYY-MM-DD`` strings.
	    """
	    df_wd = df_data.copy()
	    df_wd['day'] = [to_day(t) for t in df_wd.index]
	    day = df_wd['day']
	    explain(df_wd[day <= 10], 'early month / ')
	    explain(df_wd[(day > 10) & (day < 20)], 'mid month / ')
	    explain(df_wd[day >= 20], 'late month / ')

	# One summary figure per third of the month.
	explain_by_day(df_data)