# Paulescu/data.py

## data.py
# Per-day extraction query; `{date}` is a str.format placeholder for the day to fetch.
# NOTE(review): presumably filled in by get_data_from_db with a 'YYYY-MM-DD' string — confirm.
# The day window is half-open, [date, date + 1 day): T-SQL BETWEEN is inclusive on
# both ends, so a row stamped exactly at midnight would match two consecutive days
# and end up duplicated once the daily files are concatenated.
# NOTE(review): WITH(NOLOCK) allows dirty reads — confirm this is acceptable here.
QUERY_TEMPLATE = """
SELECT
    ts
    ,feature1
    ,feature2
    ,...
    ,target
FROM
    your_table WITH(NOLOCK)
WHERE
    ts >= '{date}' AND ts < DATEADD(day, 1, '{date}')
"""


def generate(from_date: str, to_date: str) -> None:
    """
    Fetch data from the DB, day by day, and build a single training-set CSV.

    Every day yielded by ``pd.date_range(from_date, to_date)`` is fetched with
    ``get_data_from_db`` and saved as ``<DATA_DIR>/downloads/<YYYY-MM-DD>.csv``.
    The per-day files are then read back, concatenated and written to
    ``<DATA_DIR>/training_data_<from_date>_<to_date>.csv``.

    Args:
        from_date: first day to download, in a format ``pd.date_range`` accepts.
        to_date: last day to download, in a format ``pd.date_range`` accepts.

    Raises:
        ValueError: if the requested range contains no days
            (e.g. ``from_date`` is later than ``to_date``).
    """
    data_dir = Path(config['DATA_DIR'])

    # path where all the downloaded csv files are stored;
    # exist_ok=True avoids the race of a separate "exists?" check before creating
    download_dir = data_dir / 'downloads'
    download_dir.mkdir(parents=True, exist_ok=True)

    # list of days we want to download data for
    dates = [d.strftime("%Y-%m-%d") for d in pd.date_range(from_date, to_date)]
    if not dates:
        # fail with a clear message instead of pandas' "No objects to concatenate"
        raise ValueError(
            f'No days to download between {from_date!r} and {to_date!r}'
        )

    daily_paths = []
    for date in dates:
        # fetch data from db for this date
        data = get_data_from_db(date=date)

        # save data in a csv file and remember where it went
        output_path = download_dir / f'{date}.csv'
        data.to_csv(output_path, index=False)
        daily_paths.append(output_path)

    # concatenate data for all dates, reading back the files written above
    all_data = pd.concat([pd.read_csv(path) for path in daily_paths])

    # and save it as csv file
    file_path = data_dir / f'training_data_{from_date}_{to_date}.csv'
    all_data.to_csv(file_path, index=False)
	# NOTE(review): this repeats the QUERY_TEMPLATE assignment found earlier in the
	# file and looks like an accidental duplicate — consider keeping only one copy.
	# Per-day extraction query; `{date}` is a str.format placeholder for the day to fetch.
	# The day window is half-open, [date, date + 1 day): T-SQL BETWEEN is inclusive on
	# both ends, so a row stamped exactly at midnight would match two consecutive days
	# and end up duplicated once the daily files are concatenated.
	QUERY_TEMPLATE = """
	SELECT
		ts
		,feature1
		,feature2
		,...
		,target
	FROM
		your_table WITH(NOLOCK)
	WHERE
		ts >= '{date}' AND ts < DATEADD(day, 1, '{date}')
	"""


	def generate(from_date: str, to_date: str) -> None:
		"""
		Fetch data from the DB, day by day, and build a single training-set CSV.

		Every day yielded by ``pd.date_range(from_date, to_date)`` is fetched with
		``get_data_from_db`` and saved as ``<DATA_DIR>/downloads/<YYYY-MM-DD>.csv``.
		The per-day files are then read back, concatenated and written to
		``<DATA_DIR>/training_data_<from_date>_<to_date>.csv``.

		Args:
			from_date: first day to download, in a format ``pd.date_range`` accepts.
			to_date: last day to download, in a format ``pd.date_range`` accepts.

		Raises:
			ValueError: if the requested range contains no days
				(e.g. ``from_date`` is later than ``to_date``).
		"""
		data_dir = Path(config['DATA_DIR'])

		# path where all the downloaded csv files are stored;
		# exist_ok=True avoids the race of a separate "exists?" check before creating
		download_dir = data_dir / 'downloads'
		download_dir.mkdir(parents=True, exist_ok=True)

		# list of days we want to download data for
		dates = [d.strftime("%Y-%m-%d") for d in pd.date_range(from_date, to_date)]
		if not dates:
			# fail with a clear message instead of pandas' "No objects to concatenate"
			raise ValueError(
				f'No days to download between {from_date!r} and {to_date!r}'
			)

		daily_paths = []
		for date in dates:
			# fetch data from db for this date
			data = get_data_from_db(date=date)

			# save data in a csv file and remember where it went
			output_path = download_dir / f'{date}.csv'
			data.to_csv(output_path, index=False)
			daily_paths.append(output_path)

		# concatenate data for all dates, reading back the files written above
		all_data = pd.concat([pd.read_csv(path) for path in daily_paths])

		# and save it as csv file
		file_path = data_dir / f'training_data_{from_date}_{to_date}.csv'
		all_data.to_csv(file_path, index=False)