# Paulescu/data_faster_and_better.py

## data_faster_and_better.py
def generate(from_date: str, to_date: str, overwrite: bool) -> None:
    """Build the training-data CSV covering ``from_date`` through ``to_date``.

    One CSV per day is written to ``<DATA_DIR>/downloads/<YYYY-MM-DD>.csv``
    from the result of ``get_data_from_db``. The daily files are then
    concatenated into ``<DATA_DIR>/training_data_{from_date}_{to_date}.csv``,
    where ``DATA_DIR`` is read from ``config['DATA_DIR']``.

    Args:
        from_date: First day of the range, as accepted by ``pd.date_range``.
        to_date: Last day of the range, as accepted by ``pd.date_range``.
        overwrite: When False, an existing training file makes the call a
            no-op and daily files already on disk are reused instead of being
            fetched again. When True, every file is regenerated.

    Raises:
        ValueError: If the date range contains no days.
    """
    data_dir = Path(config['DATA_DIR'])
    training_data_path = data_dir / f'training_data_{from_date}_{to_date}.csv'

    # avoid regenerating the data if it already exists and overwrite = False
    if training_data_path.exists() and not overwrite:
        print(f'training data from {from_date} to {to_date} already existed. Skipping generation.')
        return

    # path where all the downloaded csv files are stored; exist_ok=True avoids
    # the check-then-create race of a separate exists() test
    download_dir = data_dir / 'downloads'
    download_dir.mkdir(parents=True, exist_ok=True)

    # list of days we want to download data for
    dates = [d.strftime("%Y-%m-%d") for d in pd.date_range(from_date, to_date)]
    if not dates:
        # pd.concat would otherwise fail later with an opaque
        # "No objects to concatenate" ValueError
        raise ValueError(f'no dates between {from_date} and {to_date}; nothing to generate.')

    pbar = tqdm(dates)
    for date in pbar:
        pbar.set_description(f'Processing {date}')

        output_path = download_dir / f'{date}.csv'
        if output_path.exists() and not overwrite:
            # reuse the file written by a previous run
            continue

        # fetch data from db for this date and save it in a csv file
        get_data_from_db(date=date).to_csv(output_path, index=False)

    # concatenate data for all dates and save it as a single csv file
    all_data = pd.concat([pd.read_csv(download_dir / f'{date}.csv') for date in dates])
    all_data.to_csv(training_data_path, index=False)
	def generate(from_date: str, to_date: str, overwrite: bool) -> None:
		"""Build the training-data CSV covering ``from_date`` through ``to_date``.

		One CSV per day is written to ``<DATA_DIR>/downloads/<YYYY-MM-DD>.csv``
		from the result of ``get_data_from_db``. The daily files are then
		concatenated into ``<DATA_DIR>/training_data_{from_date}_{to_date}.csv``,
		where ``DATA_DIR`` is read from ``config['DATA_DIR']``.

		Args:
			from_date: First day of the range, as accepted by ``pd.date_range``.
			to_date: Last day of the range, as accepted by ``pd.date_range``.
			overwrite: When False, an existing training file makes the call a
				no-op and daily files already on disk are reused instead of
				being fetched again. When True, every file is regenerated.

		Raises:
			ValueError: If the date range contains no days.
		"""
		# NOTE(review): a function with this same name and body appears earlier
		# in this file; presumably this copy is a duplicate - confirm which one
		# should be kept.
		data_dir = Path(config['DATA_DIR'])
		training_data_path = data_dir / f'training_data_{from_date}_{to_date}.csv'

		# avoid regenerating the data if it already exists and overwrite = False
		if training_data_path.exists() and not overwrite:
			print(f'training data from {from_date} to {to_date} already existed. Skipping generation.')
			return

		# path where all the downloaded csv files are stored; exist_ok=True
		# avoids the check-then-create race of a separate exists() test
		download_dir = data_dir / 'downloads'
		download_dir.mkdir(parents=True, exist_ok=True)

		# list of days we want to download data for
		dates = [d.strftime("%Y-%m-%d") for d in pd.date_range(from_date, to_date)]
		if not dates:
			# pd.concat would otherwise fail later with an opaque
			# "No objects to concatenate" ValueError
			raise ValueError(f'no dates between {from_date} and {to_date}; nothing to generate.')

		pbar = tqdm(dates)
		for date in pbar:
			pbar.set_description(f'Processing {date}')

			output_path = download_dir / f'{date}.csv'
			if output_path.exists() and not overwrite:
				# reuse the file written by a previous run
				continue

			# fetch data from db for this date and save it in a csv file
			get_data_from_db(date=date).to_csv(output_path, index=False)

		# concatenate data for all dates and save it as a single csv file
		all_data = pd.concat([pd.read_csv(download_dir / f'{date}.csv') for date in dates])
		all_data.to_csv(training_data_path, index=False)