# iamanvesh/build_continuous_data.py

## build_continuous_data.py
import os
import csv
import pandas as pd

from datetime import datetime

# Contract month codes mapped to their calendar number. The dict is iterated
# when walking the contract files, so insertion order is kept chronological.
months = {
    code: number
    for number, code in enumerate(
        (
            'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
            'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
        ),
        start=1,
    )
}

# Contract years to scan: 2011 through 2020 inclusive.
years = list(range(2011, 2021))

# Timestamp layouts tried, in this order, for a "<Date> <Time>" string.
known_patterns = [
    '%d/%m/%Y %H:%M:%S',
    '%Y%m%d %H:%M',
    '%Y%m%d %H:%M:%S',
]

def validate_pattern(date, pattern):
    '''
    Report whether a date string can be parsed with a strptime pattern.
    Args:
        date: str -- the text to check.
        pattern: str -- a datetime.strptime format string.
    Return:
        True if strptime accepts the pair, False if it raises ValueError.
    '''
    try:
        datetime.strptime(date, pattern)
    except ValueError:
        return False
    else:
        return True


def parse_date(date):
    '''
    Convert a timestamp string into a datetime using the first layout in
    known_patterns that matches.
    Args:
        date: str
    Returns:
        parsed_date: datetime -- a seconds value of 59 is reset to 0.
    Raises:
        RuntimeError: when none of the known layouts match.
    '''
    for layout in known_patterns:
        try:
            stamp = datetime.strptime(date, layout)
        except ValueError:
            # Not this layout; try the next candidate.
            continue
        # Some contracts are labeled with right edge (mm:59); snap those to
        # the start of the same minute.
        return stamp.replace(second=0) if stamp.second == 59 else stamp
    raise RuntimeError(f'Unable to parse {date}')


def read_file(file_name):
    '''
    Load one contract csv file and merge its Date and Time columns.
    Args:
        file_name Name of the file.
    Return:
        A Pandas DataFrame with the columns Date, Open, High, Low, Close,
        Volume and OI, where Date holds the values returned by parse_date.
    '''
    column_names = ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'OI']
    frame = pd.read_csv(file_name, names=column_names, sep=',')

    def _pad_hour(clock):
        # Times whose first character is '9' get a leading zero (9:15 -> 09:15).
        if clock[0] == '9':
            return f'0{clock}'
        return clock

    frame['Time'] = frame['Time'].apply(_pad_hour)

    # Build "<Date> <Time>" per row, then parse it into a timestamp.
    stamps = frame['Date'].astype(str) + ' ' + frame['Time'].astype(str)
    frame['Date'] = stamps.apply(parse_date)

    return frame.drop('Time', axis=1)


def get_file_name(month, year, symbol):
    '''
    Returns the path of the contract's file based on the name and the year.
    Assumes that the contracts dir tree is of the following format
    ./symbol
        2011/
            {symbol}JAN11.csv
    Args:
        month: str -- month code used in the file name, e.g. 'JAN'.
        year: int -- four-digit contract year.
        symbol: str -- tradingsymbol; names both the root directory and the
            file prefix.
    Returns:
        path: str
    '''
    # The file name carries the last two digits of the year, zero padded, so
    # 2009 yields '09' rather than '9'.
    return f'./{symbol}/{year}/{symbol}{month}{year % 100:02d}.csv'


def build_cont_data(expiry_hour, expiry_minute, cont_file_name, symbol):
    '''
    Iterates over all the contract files and builds a continuous futures contract
    with left-labeled timestamps.
    Args:
        expiry_hour: int -- the hour at which the current contract should be rolled over.
        expiry_minute: int -- the minute at which the current contract should be rolled over.
        cont_file_name: str -- path of the file to store the continuous data.
        symbol: str -- tradingsymbol of the instrument.
    Returns:
        None
    '''
    columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'OI']
    prev_expiry = None
    segments = []

    for year in years:
        for month in months:
            print(month, year)
            file_name = get_file_name(month, year, symbol)

            if not os.path.exists(file_name):
                # A missing contract file ends the scan of this year; the
                # outer loop resumes with the first month of the next year.
                break

            curr_contract = read_file(file_name)
            # The roll-over instant is the date of the contract's last row at
            # the requested hour and minute.
            curr_expiry = curr_contract.iloc[-1]['Date'].replace(
                hour=expiry_hour, minute=expiry_minute
            )

            # Keep rows up to this contract's expiry and, after the first
            # contract, only rows later than the previous expiry.
            keep = curr_contract['Date'] <= curr_expiry
            if prev_expiry is not None:
                keep = keep & (curr_contract['Date'] > prev_expiry)

            segment = curr_contract[keep]
            if not segment.empty:
                segments.append(segment)
            prev_expiry = curr_expiry

    # DataFrame.append no longer exists in pandas 2.x; concatenate the
    # collected slices once instead of growing a frame inside the loop.
    if segments:
        cont_data = pd.concat(segments)
    else:
        cont_data = pd.DataFrame(columns=columns)

    cont_data.set_index('Date').to_csv(cont_file_name, float_format='%.2f')


if __name__ == '__main__':
    # Build the NIFTY continuous series, rolling each contract at 15:14.
    build_cont_data(
        expiry_hour=15,
        expiry_minute=14,
        cont_file_name='./nifty_continuous.csv',
        symbol='NIFTY',
    )
	import os
	import csv
	import pandas as pd

	from datetime import datetime

	# Contract month codes mapped to their calendar number. The dict is iterated
	# when walking the contract files, so insertion order is kept chronological.
	months = {
	    code: number
	    for number, code in enumerate(
	        (
	            'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
	            'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
	        ),
	        start=1,
	    )
	}

	# Contract years to scan: 2011 through 2020 inclusive.
	years = list(range(2011, 2021))

	# Timestamp layouts tried, in this order, for a "<Date> <Time>" string.
	known_patterns = [
	    '%d/%m/%Y %H:%M:%S',
	    '%Y%m%d %H:%M',
	    '%Y%m%d %H:%M:%S',
	]

	def validate_pattern(date, pattern):
	    '''
	    Check if the given date is in the required pattern.
	    Args:
	        date: str -- the text to check.
	        pattern: str -- a datetime.strptime format string.
	    Return:
	        True if strptime accepts the pair, False if it raises ValueError.
	    '''
	    try:
	        datetime.strptime(date, pattern)
	    except ValueError:
	        return False
	    return True


	def parse_date(date):
	    '''
	    Parse the date string with one of the known formats
	    Args:
	        date: str
	    Returns:
	        parsed_date: datetime -- a seconds value of 59 is reset to 0.
	    Raises:
	        RuntimeError: when none of the known formats match.
	    '''
	    for pattern in known_patterns:
	        # Parse once per pattern; a ValueError means the pattern does not fit.
	        try:
	            parsed_date = datetime.strptime(date, pattern)
	        except ValueError:
	            continue
	        if parsed_date.second == 59:
	            # Some contracts are labeled with right edge (mm:59)
	            parsed_date = parsed_date.replace(second=0)
	        return parsed_date
	    raise RuntimeError(f'Unable to parse {date}')


	def read_file(file_name):
	    '''
	    Load one contract csv file and merge its Date and Time columns.
	    Args:
	        file_name Name of the file.
	    Return:
	        A Pandas DataFrame with the columns Date, Open, High, Low, Close,
	        Volume and OI, where Date holds the values returned by parse_date.
	    '''
	    data = pd.read_csv(
	        file_name,
	        names=['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'OI'],
	        sep=','
	    )
	    # Times whose first character is '9' get a leading zero (9:15 -> 09:15).
	    data['Time'] = data['Time'].apply(lambda x: f'0{x}' if x[0] == '9' else x)
	    # Build "<Date> <Time>" per row, then parse it into a timestamp.
	    data['Date'] = data['Date'].astype(str) + ' ' + data['Time'].astype(str)
	    data['Date'] = data['Date'].apply(parse_date)

	    return data.drop('Time', axis=1)


	def get_file_name(month, year, symbol):
	    '''
	    Returns the path of the contract's file based on the name and the year.
	    Assumes that the contracts dir tree is of the following format
	    ./symbol
	        2011/
	            {symbol}JAN11.csv
	    Args:
	        month: str -- month code used in the file name, e.g. 'JAN'.
	        year: int -- four-digit contract year.
	        symbol: str -- tradingsymbol; names both the root directory and the
	            file prefix.
	    Returns:
	        path: str
	    '''
	    # The file name carries the last two digits of the year, zero padded, so
	    # 2009 yields '09' rather than '9'.
	    return f'./{symbol}/{year}/{symbol}{month}{year % 100:02d}.csv'


	def build_cont_data(expiry_hour, expiry_minute, cont_file_name, symbol):
	    '''
	    Iterates over all the contract files and builds a continuous futures contract
	    with left-labeled timestamps.
	    Args:
	        expiry_hour: int -- the hour at which the current contract should be rolled over.
	        expiry_minute: int -- the minute at which the current contract should be rolled over.
	        cont_file_name: str -- path of the file to store the continuous data.
	        symbol: str -- tradingsymbol of the instrument.
	    Returns:
	        None
	    '''
	    columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'OI']
	    prev_expiry = None
	    segments = []

	    for year in years:
	        for month in months:
	            print(month, year)
	            file_name = get_file_name(month, year, symbol)

	            if not os.path.exists(file_name):
	                # A missing contract file ends the scan of this year; the
	                # outer loop resumes with the first month of the next year.
	                break

	            curr_contract = read_file(file_name)
	            # The roll-over instant is the date of the contract's last row
	            # at the requested hour and minute.
	            curr_expiry = curr_contract.iloc[-1]['Date'].replace(
	                hour=expiry_hour, minute=expiry_minute
	            )

	            # Keep rows up to this contract's expiry and, after the first
	            # contract, only rows later than the previous expiry.
	            keep = curr_contract['Date'] <= curr_expiry
	            if prev_expiry is not None:
	                keep = keep & (curr_contract['Date'] > prev_expiry)

	            segment = curr_contract[keep]
	            if not segment.empty:
	                segments.append(segment)
	            prev_expiry = curr_expiry

	    # DataFrame.append no longer exists in pandas 2.x; concatenate the
	    # collected slices once instead of growing a frame inside the loop.
	    if segments:
	        cont_data = pd.concat(segments)
	    else:
	        cont_data = pd.DataFrame(columns=columns)

	    cont_data.set_index('Date').to_csv(cont_file_name, float_format='%.2f')


	if __name__ == '__main__':
	    # Build the NIFTY continuous series, rolling each contract at 15:14.
	    build_cont_data(15, 14, './nifty_continuous.csv', 'NIFTY')