Last active
January 3, 2021 06:24
-
-
Save Joshuaek/b3182e996f9b851c811a182264ccd5a4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import argparse | |
import fnmatch | |
import logging | |
import papermill as pm | |
from datetime import datetime | |
import time | |
# This gist has been upgraded to a repository now! https://github.com/Joshuaek/NotebookScheduler | |
# The repo version has fixes and improvements! | |
# Read more here: https://productmetrics.net/blog/schedule-jupyter-notebooks/ | |
# | |
# Notebook Scheduler | |
# --------------------------------------- | |
# This script helps with the automated processing of Jupyter Notebooks via papermill (https://github.com/nteract/papermill/) | |
# The script can work in two ways: | |
# 1. With an external scheduler you can schedule execution of all notebooks in one of the hourly/daily/weekly directories | |
# 2. If no directory is specified, it uses the `schedule` library to run its own scheduler, processing the notebooks in each of the hourly/daily/weekly directories
# | |
# This means you can either use your operating system's task scheduler or, if it's easier, just let the script do the scheduling
# | |
# The folder structure is as follows | |
# | |
# <script_folder>/
# ├── hourly/
# │   ├── notebook1.ipynb
# │   ├── notebook2.ipynb
# │   └── snapshots/
# │       ├── notebook1/
# │       │   └── notebook1 2019-03-22 21:22:58.439005.ipynb
# │       └── notebook2/
# │           └── notebook2 2019-03-22 21:23:14.5315001.ipynb
# ├── daily/
# │   ├── notebook1.ipynb
# │   ├── notebook2.ipynb
# │   └── snapshots/
# │       ├── notebook1/
# │       │   └── notebook1 2019-03-22 21:22:58.439005.ipynb
# │       └── notebook2/
# │           └── notebook2 2019-03-22 21:23:14.5315001.ipynb
# └── weekly/
#     ├── notebook1.ipynb
#     ├── notebook2.ipynb
#     └── snapshots/
#         ├── notebook1/
#         │   └── notebook1 2019-03-22 21:22:58.439005.ipynb
#         └── notebook2/
#             └── notebook2 2019-03-22 21:23:14.5315001.ipynb
# | |
# Folders will be created if they don't already exist in the same folder as this script
# To use, just put notebooks into the hourly / daily / weekly folders and run this script without any arguments | |
# Name of the sub-directory, inside each notebook directory, where executed notebook snapshots are saved
snapshotDir = 'snapshots'
def findFiles(directory, pattern):
    """Yield the paths of files in ``directory`` whose names match ``pattern``.

    Matching is case-insensitive on every platform: both the file name and
    the pattern are lower-cased and compared with ``fnmatch.fnmatchcase``,
    so ``*.ipynb`` and ``*.IPYNB`` select the same files. Only the directory
    itself is listed (no recursion), sub-directories whose names happen to
    match are skipped, and entries are visited in sorted order so the result
    is reproducible.

    Args:
        directory: Path of the directory to list.
        pattern: Shell-style wildcard pattern (see the ``fnmatch`` module).

    Yields:
        ``os.path.join(directory, filename)`` for each matching file.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. the directory does not
            exist); raised when iteration starts, since this is a generator.
    """
    # Lower-case the pattern once; lower-casing only the file name would make
    # any pattern containing upper-case characters unmatchable on POSIX.
    wanted = pattern.lower()
    for filename in sorted(os.listdir(directory)):
        if not fnmatch.fnmatchcase(filename.lower(), wanted):
            continue
        path = os.path.join(directory, filename)
        # A directory named e.g. "old.ipynb" is not something callers can process.
        if os.path.isfile(path):
            yield path
def processNotebooks(notebookDirectory, params=None):
    """Execute every notebook in ``notebookDirectory`` and save a snapshot of each run.

    For each ``*.ipynb`` file found directly in ``notebookDirectory`` the
    notebook is executed with papermill and the executed copy is written to
    ``<notebookDirectory>/<snapshotDir>/<notebook name>/<notebook name> <timestamp>.ipynb``.
    The snapshot directories are created on demand.

    Args:
        notebookDirectory: Directory containing the notebooks to run.
        params: Optional value passed to papermill as ``parameters``.

    A failure while processing one notebook is logged (with traceback) and
    does not stop the remaining notebooks from being processed. Errors raised
    while creating the top-level snapshot directory or listing the notebook
    directory are not caught here.
    """
    logging.info('Processing %s', notebookDirectory)

    # Each time a notebook is processed a snapshot is saved to a snapshot sub-directory.
    # This checks the sub-directory exists and creates it if not.
    snapshotRoot = os.path.join(notebookDirectory, snapshotDir)
    if not os.path.isdir(snapshotRoot):
        os.mkdir(snapshotRoot)

    for notebookPath in findFiles(notebookDirectory, '*.ipynb'):
        try:
            # splitext strips the extension whatever its case and never cuts a
            # name that merely contains ".ipynb" somewhere in the middle.
            notebookName = os.path.splitext(os.path.basename(notebookPath))[0]

            # Within the snapshot directory, each notebook output is stored in its own sub-directory
            notebookSnapshot = os.path.join(snapshotRoot, notebookName)
            if not os.path.isdir(notebookSnapshot):
                os.mkdir(notebookSnapshot)

            # The snapshot file name includes a timestamp. Colons are not allowed
            # in Windows file names, so they are replaced there; other platforms
            # keep the original "YYYY-MM-DD HH:MM:SS.ffffff" form.
            timestamp = str(datetime.now())
            if os.name == 'nt':
                timestamp = timestamp.replace(':', '.')
            output_file = os.path.join(notebookSnapshot, notebookName + ' ' + timestamp + '.ipynb')

            # Execute the notebook and save the snapshot
            pm.execute_notebook(
                notebookPath,
                output_file,
                parameters=params
            )
        except Exception:
            # Any error with this notebook is logged (including which notebook
            # failed) and processing continues with the next one.
            logging.exception('Error processing notebook %s', notebookPath)
def main():
    """Command-line entry point for the notebook scheduler.

    Changes into the script's own directory, configures logging to
    ``notebooks.log`` and the console, makes sure the ``daily`` / ``hourly`` /
    ``weekly`` folders exist, and then either:

    * processes the single directory given with ``-d/--directory`` and
      returns, or
    * when no directory is given, registers the hourly/daily/weekly jobs with
      the ``schedule`` library and runs them until interrupted.
    """
    # Ensure we're running in the same directory as the script
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    # Set up logger to display to screen and file
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        filename='notebooks.log')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger('').addHandler(console)

    # Check if the subfolders for notebooks exist, and create them if they don't
    for directory in ['daily', 'hourly', 'weekly']:
        os.makedirs(directory, exist_ok=True)

    # Get optional directory passed in via command line. If this is specified, then we just
    # process the requested directory. This is useful if you're scheduling the processing
    # with an external task scheduler. If it is not specified, we run our own scheduler.
    parser = argparse.ArgumentParser(
        description='Run Jupyter notebooks via papermill, either once for a single '
                    'directory or continuously on a built-in schedule.')
    parser.add_argument('-d', '--directory',
                        help='Which set of notebooks to process - e.g. hourly',
                        required=False,
                        default=None)
    argument = parser.parse_args()

    if argument.directory:
        # If a directory has been specified, we'll just process that one directory now and exit
        processNotebooks(argument.directory)
        return

    # Only require the schedule module if we're using the internal scheduler
    # Install this with pip install schedule
    import schedule

    # No directory has been specified: schedule the processing and execute
    schedule.every().hour.at(':40').do(processNotebooks, notebookDirectory='hourly')
    schedule.every().day.at('13:15').do(processNotebooks, notebookDirectory='daily')
    schedule.every().sunday.at('13:15').do(processNotebooks, notebookDirectory='weekly')

    # Run the scheduled tasks until the user interrupts the script
    try:
        while True:
            schedule.run_pending()
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the built-in scheduler; exit without a traceback
        logging.info('Scheduler stopped by user')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment