Skip to content

Instantly share code, notes, and snippets.

@Paulescu
Created October 11, 2021 08:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Paulescu/01ce2cabab82981258bc2a36db7718b5 to your computer and use it in GitHub Desktop.
Save Paulescu/01ce2cabab82981258bc2a36db7718b5 to your computer and use it in GitHub Desktop.
data_faster_and_better.py
def generate(from_date: str, to_date: str, overwrite: bool):
    """Build the training-data CSV for every day from ``from_date`` to ``to_date``.

    Data is fetched one day at a time with ``get_data_from_db`` and stored as
    ``<DATA_DIR>/downloads/<YYYY-MM-DD>.csv``. The per-day files are then
    concatenated and written to
    ``<DATA_DIR>/training_data_<from_date>_<to_date>.csv``.

    Args:
        from_date: First day of the range, in a format ``pd.date_range`` accepts.
        to_date: Last day of the range (included in the range).
        overwrite: When False, an existing training-data file short-circuits
            the whole call and existing per-day files are reused instead of
            being fetched again. When True, everything is regenerated.

    Raises:
        ValueError: If the range contains no days (e.g. ``from_date`` is later
            than ``to_date``).
    """
    data_dir = Path(config['DATA_DIR'])

    # Single source of truth for the output path: it is used both for the
    # "already generated" check and for the final write.
    training_data_path = data_dir / f'training_data_{from_date}_{to_date}.csv'

    # avoid regenerating the data if it already exists and overwrite = False
    if training_data_path.exists() and not overwrite:
        print(f'training data from {from_date} to {to_date} already existed. Skipping generation.')
        return

    # list of days we want to download data for
    dates = [d.strftime("%Y-%m-%d") for d in pd.date_range(from_date, to_date)]
    if not dates:
        # Fail early with a clear message; otherwise pd.concat below would
        # raise an opaque "No objects to concatenate" ValueError.
        raise ValueError(
            f'No dates between {from_date} and {to_date}: nothing to generate.'
        )

    # path where all the downloaded csv files are stored; exist_ok avoids the
    # check-then-create race of testing for existence first
    download_dir = data_dir / 'downloads'
    download_dir.mkdir(parents=True, exist_ok=True)

    pbar = tqdm(dates)
    for date in pbar:
        pbar.set_description(f'Processing {date}')

        output_path = download_dir / f'{date}.csv'
        if output_path.exists() and not overwrite:
            # this day is already on disk, reuse it
            continue

        # fetch data from db for this date
        data = get_data_from_db(date=date)

        # save data in a csv file
        data.to_csv(output_path, index=False)

    # concatenate data for all dates, reading back from disk so that reused
    # and freshly downloaded days are handled the same way
    all_data = pd.concat(
        [pd.read_csv(download_dir / f'{date}.csv') for date in dates]
    )

    # and save it as csv file
    all_data.to_csv(training_data_path, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment