Skip to content

Instantly share code, notes, and snippets.

@Paulescu
Last active October 11, 2021 08:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Paulescu/8b2bf3fbdde41903e27de5b04e001e0e to your computer and use it in GitHub Desktop.
Save Paulescu/8b2bf3fbdde41903e27de5b04e001e0e to your computer and use it in GitHub Desktop.
data.py
# SQL used to pull a single day of data; `{date}` is filled in with a
# 'YYYY-MM-DD' string via str.format.
#
# The time filter is a half-open interval [date, date + 1 day). An inclusive
# BETWEEN would also match rows stamped exactly at midnight of the next day,
# so those rows would be returned for two consecutive days and show up twice
# once the daily extracts are concatenated.
QUERY_TEMPLATE = """
SELECT
ts
,feature1
,feature2
,...
,target
FROM
your_table WITH(NOLOCK)
WHERE
ts >= '{date}' AND ts < DATEADD(day, 1, '{date}')
"""
def generate(from_date: str, to_date: str):
    """
    Fetches data from the DB, day by day, and stores it in separate CSV files.
    Then, compacts all the CSV files into a final training set.

    Args:
        from_date: first day to download, as accepted by `pd.date_range`.
        to_date: last day to download (inclusive), as accepted by
            `pd.date_range`.

    Raises:
        ValueError: if the range from `from_date` to `to_date` contains no days.

    Returns:
        None. The per-day files are written to `<DATA_DIR>/downloads/` and the
        combined file to `<DATA_DIR>/training_data_<from_date>_<to_date>.csv`.
    """
    # list of days we want to download data for
    dates = [d.strftime("%Y-%m-%d") for d in pd.date_range(from_date, to_date)]
    if not dates:
        # fail early with a clear message instead of letting pd.concat raise
        # "No objects to concatenate" after the directory has been created
        raise ValueError(
            f"no days to download between {from_date!r} and {to_date!r}; "
            "from_date must not be after to_date"
        )

    data_dir = Path(config['DATA_DIR'])

    # path where all the downloaded csv files are stored;
    # exist_ok avoids the check-then-create race of a separate exists() test
    download_dir = data_dir / 'downloads'
    download_dir.mkdir(parents=True, exist_ok=True)

    for date in dates:
        # fetch data from db for this date
        data = get_data_from_db(date=date)

        # save data in a csv file
        output_path = download_dir / f'{date}.csv'
        data.to_csv(output_path, index=False)

    # concatenate data for all dates; the daily files are read back from disk
    # so the final set is built from exactly what was stored
    all_data = pd.concat(
        [pd.read_csv(download_dir / f'{date}.csv') for date in dates]
    )

    # and save it as csv file
    file_path = data_dir / f'training_data_{from_date}_{to_date}.csv'
    all_data.to_csv(file_path, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment