# -*- coding: utf-8 -*-
"""hydrosaver.ipynb

Starter Colab/Jupyter notebook for the hydrosaver competition.

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/notebook#fileId=1gs18AtviN2Y3jSsVF2rgprAtCA8Jnt_8
"""
# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
#!pip install http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
#!pip install xgboost tpot pandas-profiling seaborn torchvision tqdm
# %pylab inline
import numpy as np
import pandas as pd
import seaborn as sn
import os
from tqdm import tqdm
eps = 1e-6
seed = 42
np.random.seed(seed)
"""# Download data"""
# from https://stackoverflow.com/a/39225039/221742
import requests
def download_file_from_google_drive(id, destination):
    def get_confirm_token(response):
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value
        return None

    def save_response_content(response, destination):
        CHUNK_SIZE = 32768
        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    # Large files trigger a "can't scan for viruses" interstitial; the confirm
    # token from the cookie lets us bypass it.
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)
if not os.path.isdir('data/original'):
    os.makedirs('data/original')
download_file_from_google_drive('15BqAMEBsTjAzT2eJXED-zA1pdHpGWZLl', './data/original/train.csv')
download_file_from_google_drive('1Xi_lLCKTsgSNECerpIPhQPzUCtmutDeS', './data/original/publishable_test_set.csv')
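# Quick sanity check (an addition, not in the original notebook): confirm both
# files downloaded and are non-trivial in size -- a failed Drive download often
# yields a tiny HTML error page instead of a CSV.
for path in ['./data/original/train.csv', './data/original/publishable_test_set.csv']:
    assert os.path.isfile(path), 'missing %s' % path
    print(path, os.path.getsize(path), 'bytes')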
"""# Load data"""
# The data uses several sentinel strings for missing values ('No Data', 'Bad Input', etc.), and has a date index column
df_train_val = pd.read_csv('./data/original/train.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])
df_train_val = df_train_val.dropna(axis=1, how='all') # drop the columns that are all NaN's
df_train_val = df_train_val.resample('1T').first()  # resample to 1-minute intervals
df_train_val = df_train_val.drop('DIC88023.PV', axis=1)
df_test = pd.read_csv('./data/original/publishable_test_set.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])
df_test = df_test.dropna(axis=1, how='all') # drop the columns that are all NaN's
y_train_val = df_train_val.target
x_train_val = df_train_val.drop('target', 1) # We don't want the answer to be in the input data
x_test = df_test
# normalize the input columns
x_mean = x_train_val.mean()
x_std = x_train_val.std()
x_train_val = (x_train_val - x_mean)/(x_std + eps)
x_test = (x_test - x_mean)/(x_std + eps)
# TODO I may want to normalize y too
print('mean', x_mean)
print('std', x_std)
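# A hedged sketch of the TODO above (not used below): y could be normalized the
# same way, as long as predictions are mapped back before scoring/submission.
# y_mean, y_std = y_train_val.mean(), y_train_val.std()
# y_train_val = (y_train_val - y_mean) / (y_std + eps)
# ... and after predicting: y_pred = y_pred_norm * (y_std + eps) + y_mean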
# TPOT won't accept NaNs, so we either replace or drop them
# Another approach would be to use sentinel values or extra indicator columns
# Since we've normalized the inputs, 0 is the mean value, so let's fill with that
x_train_val = x_train_val.replace(np.nan, 0)
y_train_val = y_train_val.replace(np.nan, 0)
x_test = x_test.replace(np.nan, 0)
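# A sketch of the indicator-column alternative mentioned above (an assumption,
# not used in this notebook); it would be applied *before* the fill step above,
# so the model can distinguish "missing" from "actually zero":
# nan_flags = x_train_val.isnull().astype(float).add_suffix('_isnan')
# x_train_val = pd.concat([x_train_val.fillna(0), nan_flags], axis=1)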
# since it's a timeseries the validation will be in the future
val_split_in = int(len(df_train_val.index)*0.85)
x_val = x_train_val[val_split_in:]
x_train = x_train_val[:val_split_in]
y_val = y_train_val[val_split_in:]
y_train = y_train_val[:val_split_in]
# convert to numpy
X_train = x_train.values
y_train = y_train.values
X_val = x_val.values
y_val = y_val.values
X_test = x_test.values
"""# Have look into the data"""
df_train_val.info()
df_train_val.describe()
# You can use pandas profiling to get an overview of the data
import pandas_profiling
profile = pandas_profiling.ProfileReport(df_train_val[:2000])
profile.to_file(outputfile="/tmp/myoutputfile.html")
profile
"""# TPOT!
TPOT is an automatic machine learning library that uses genetic algorithms to try different generations of scikit-learn algorihtms.
link: https://epistasislab.github.io/tpot/
"""
# Check data for TPOT compatibility
from tpot.base import check_X_y
check_X_y(X_train, y_train, accept_sparse=True)
check_X_y(X_val, y_val, accept_sparse=True)
'ok'
# Ensure it respects causality, by only giving each sample access to a window of past data
# Make padded sequences; we need the data in shape (batch, window_of_timesteps, features)
def timeseries_to_seq(x, window=3):
    """
    Inputs:
    - x: shape (timesteps, features)
    - window: e.g. 3
    Outputs:
    - y: shape (batch, window, features)
    """
    # left-pad with zeros so the first samples still get a full-length window
    x_pad = np.pad(x, [[window, 0], [0, 0]], mode='constant')
    # the window for sample i covers rows i-window .. i-1 of the original series
    y = np.stack([x_pad[i:i + window] for i in range(len(x))], axis=0)
    return y
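# Tiny shape check (an addition): 5 timesteps of 2 features with window=3
# should yield (5, 3, 2) -- one window of past data per sample.
_demo = timeseries_to_seq(np.arange(10.).reshape(5, 2), window=3)
assert _demo.shape == (5, 3, 2)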
# For now I will just run on a subset of the data, for speed!
subset = 200
window = 60 * 3  # 3 hours of 1-minute samples
x = X_train[:subset]
y_stacked = y_train[:subset]
print(x.shape)
X_train_stacked = timeseries_to_seq(x, window=window).reshape((x.shape[0], -1))
from tpot import TPOTRegressor
# A quick run of TPOT with a small population and a short number of generations
# Takes about 25 minutes to run
tpot = TPOTRegressor(generations=3, population_size=10, verbosity=3)
tpot.fit(X_train_stacked, y_stacked)
tpot.export('tpot_hydrosaver_export.py')
# What's the pipeline it saved?
# In this case it found that LassoLarsCV(normalize=False) performed best
#!cat tpot_hydrosaver_export.py
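# For reference, the found estimator could be refit directly instead of going
# through the exported script (a sketch; TPOT's export may wrap it in a fuller
# pipeline):
# from sklearn.linear_model import LassoLarsCV
# model = LassoLarsCV(normalize=False)
# model.fit(X_train_stacked, y_stacked)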
# final score
def rmse(y_pred, y_true):
    sqloss = (y_true - y_pred)**2
    return np.sqrt(sqloss.mean())
X_val_stacked = timeseries_to_seq(X_val, window=window).reshape((X_val.shape[0], -1))
y_pred = tpot.predict(X_val_stacked)
score = rmse(y_pred, y_val)
score
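# Naive-baseline comparison (an addition): always predicting the training mean
# gives an RMSE floor that the fitted pipeline should beat.
baseline_rmse = rmse(np.full_like(y_val, y_train.mean()), y_val)
print('tpot rmse: %.4f, mean-baseline rmse: %.4f' % (score, baseline_rmse))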
X_test_stacked = timeseries_to_seq(X_test, window=window).reshape((X_test.shape[0], -1))
y_pred = tpot.predict(X_test_stacked)
# save the submission
s = pd.Series(y_pred, name='target')
assert len(s)==439140
import datetime
ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')
submission_file = 'submission_%s_score_%2.2f.csv'%(ts,score)
s.to_csv(submission_file, index=False, header=True, float_format='%2.9s')
print('upload file', submission_file)
# and download
from google.colab import files
files.download(submission_file)