
@vdutor
Last active October 12, 2022 15:47
Python script to read the airline dataset
# Copyright 2021 Vincent Dutordoir
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
# and associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial
# portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
# LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import numpy as np
import pandas as pd


def airline(n=None):
    """
    Script adapted from James Hensman:
    https://github.com/jameshensman/VFF/blob/master/experiments/airline/airline_vff_additive.py

    The data pickle file can be downloaded from:
    https://drive.google.com/file/d/1CnA6FYb8jNUckJt4VLz_ONA1KgV-bXwK/view?usp=sharing

    Returns the airline delay dataset, containing 5929413 rows in total.
    Each datapoint has 8 features. All features are rescaled to [-1, 1] and
    the target is normalized to be N(0, 1) distributed.

    :param n: int, optional
        Total dataset size (train + test). The first two thirds are used for
        training (n_train = 2n/3) and the remaining third for testing
        (n_test = n/3). Defaults to None, which returns all 5929413 rows.
    :return:
        X: [n_train, 8], Y: [n_train, 1]
        XT: [n_test, 8], YT: [n_test, 1]
    """
    # Import the data
    data = pd.read_pickle('airline.pickle')

    # Convert time of day from hhmm to minutes since midnight
    data.ArrTime = 60 * np.floor(data.ArrTime / 100) + np.mod(data.ArrTime, 100)
    data.DepTime = 60 * np.floor(data.DepTime / 100) + np.mod(data.DepTime, 100)

    # Pick out the data
    Y = data['ArrDelay'].values
    names = [
        'Month', 'DayofMonth',
        'DayOfWeek', 'plane_age',
        'AirTime', 'Distance',
        'ArrTime', 'DepTime',
    ]
    X = data[names].values

    if n is None:
        n = len(X)
        assert n == len(Y)

    # Shuffle the data and only consider a subset of it
    perm = np.random.permutation(len(X))
    X = X[perm]
    Y = Y[perm]
    XT = X[int(2 * n / 3):n]
    YT = Y[int(2 * n / 3):n]
    X = X[:int(2 * n / 3)]
    Y = Y[:int(2 * n / 3)]

    # Normalize Y scale and offset
    Ymean = Y.mean()
    Ystd = Y.std()
    Y = (Y - Ymean) / Ystd
    Y = Y.reshape(-1, 1)
    YT = (YT - Ymean) / Ystd
    YT = YT.reshape(-1, 1)

    # Normalize X on [-1, 1]
    Xmin, Xmax = X.min(0), X.max(0)
    X = (X - Xmin) / (Xmax - Xmin)
    X = 2 * (X - 0.5)
    XT = (XT - Xmin) / (Xmax - Xmin)
    XT = 2 * (XT - 0.5)

    return X, Y, XT, YT
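
A minimal usage sketch, assuming 'airline.pickle' is in the working directory; the subset size and random seed are arbitrary choices:

import numpy as np

# Fix the seed so the shuffle, and hence the train/test split, is reproducible.
np.random.seed(0)

# Request a 10,000-row subset: the first two thirds go to training, the rest to testing.
X, Y, XT, YT = airline(n=10000)

print(X.shape, Y.shape)    # expected: (6666, 8) (6666, 1)
print(XT.shape, YT.shape)  # expected: (3334, 8) (3334, 1)

# Inputs are scaled to [-1, 1] using the training min/max, so test inputs
# can fall slightly outside that range.
print(X.min(0), X.max(0))
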
@spectraldani

The pickle doesn't load with current versions of pandas, but it can be read with version 0.17.1, which can be installed from conda-forge :)
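
A possible one-off workaround (a sketch, assuming an environment with pandas 0.17.1 installed from conda-forge): load the pickle once with the old pandas and re-save it in a version-agnostic format such as CSV, then read that file in the script instead:

# Run this once inside the pandas 0.17.1 environment;
# 'airline.csv' is an arbitrary output name.
import pandas as pd

data = pd.read_pickle('airline.pickle')
data.to_csv('airline.csv', index=False)

# In airline(), the corresponding change would be:
#     data = pd.read_pickle('airline.pickle')  ->  data = pd.read_csv('airline.csv')
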
