# Source: GitHub Gist by @Joshuaek (gist id b3182e996f9b851c811a182264ccd5a4),
# last active January 3, 2021 06:24.
import os
import sys
import argparse
import fnmatch
import logging
import papermill as pm
from datetime import datetime
import time
# This gist has been upgraded to a repository now! https://github.com/Joshuaek/NotebookScheduler
# The repo version has fixes and improvements!
# Read more here: https://productmetrics.net/blog/schedule-jupyter-notebooks/
#
# Notebook Scheduler
# ---------------------------------------
# This script helps with the automated processing of Jupyter Notebooks via papermill (https://github.com/nteract/papermill/)
# The script can work in two ways:
# 1. With an external scheduler you can schedule execution of all notebooks in one of the hourly/daily/weekly directories
# 2. If no directory is specified it will use the `schedule` library to run its own scheduler processing notebooks in each of the hourly/daily/weekly directories
#
# This means you can either use your operating system's task scheduler or, if it's easier, just let the script do the scheduling
#
# The folder structure is as follows
#
# <script_folder>/
# ├── hourly/
# │   ├── notebook1.ipynb
# │   ├── notebook2.ipynb
# │   └── snapshots/
# │       ├── notebook1/
# │       │   └── notebook1 2019-03-22 21:22:58.439005.ipynb
# │       └── notebook2/
# │           └── notebook2 2019-03-22 21:23:14.5315001.ipynb
# ├── daily/
# │   ├── notebook1.ipynb
# │   ├── notebook2.ipynb
# │   └── snapshots/
# │       ├── notebook1/
# │       │   └── notebook1 2019-03-22 21:22:58.439005.ipynb
# │       └── notebook2/
# │           └── notebook2 2019-03-22 21:23:14.5315001.ipynb
# └── weekly/
#     ├── notebook1.ipynb
#     ├── notebook2.ipynb
#     └── snapshots/
#         ├── notebook1/
#         │   └── notebook1 2019-03-22 21:22:58.439005.ipynb
#         └── notebook2/
#             └── notebook2 2019-03-22 21:23:14.5315001.ipynb
#
# Folders will be created in the same folder as this script if they don't already exist
# To use, just put notebooks into the hourly / daily / weekly folders and run this script without any arguments
# Name of the sub-directory, inside each notebook directory, under which the
# executed notebook snapshots are stored.
snapshotDir = 'snapshots'
def findFiles(directory, pattern):
    """Yield paths of files directly inside *directory* whose names match *pattern*.

    Matching is case-insensitive: both the file name and the glob pattern are
    lower-cased before they are compared, so ``'*.ipynb'`` and ``'*.IPYNB'``
    select the same files. Entries that are not regular files (for example a
    sub-directory whose name happens to match) are skipped, and the search
    does not recurse.

    Args:
        directory: Directory to list.
        pattern: ``fnmatch``-style glob, e.g. ``'*.ipynb'``.

    Yields:
        ``os.path.join(directory, filename)`` for each matching file, in the
        order ``os.listdir`` returns the names.

    Raises:
        OSError: Propagated from ``os.listdir`` when *directory* cannot be
            listed (raised on first iteration, since this is a generator).
    """
    # Lower-case the pattern once. Comparing a lower-cased name against the
    # raw pattern would make any pattern containing upper-case letters
    # unmatchable on case-sensitive platforms.
    lowered_pattern = pattern.lower()
    for filename in os.listdir(directory):
        # fnmatchcase: both sides are already normalised, so skip the
        # platform-dependent case handling that fnmatch() applies.
        if not fnmatch.fnmatchcase(filename.lower(), lowered_pattern):
            continue
        path = os.path.join(directory, filename)
        if os.path.isfile(path):
            yield path
def processNotebooks(notebookDirectory, params=None):
    """Execute every notebook in *notebookDirectory* and save a snapshot of each run.

    For each ``*.ipynb`` file returned by ``findFiles`` the notebook is run
    with papermill and the executed copy is written to
    ``<notebookDirectory>/<snapshotDir>/<notebook name>/<notebook name> <timestamp>.ipynb``.
    A failure while processing one notebook is logged and does not stop the
    remaining notebooks from being processed.

    Args:
        notebookDirectory: Directory containing the notebooks to run.
        params: Optional value passed to papermill as ``parameters``.

    Returns:
        None.

    Raises:
        OSError: If the snapshot directory cannot be created or
            *notebookDirectory* cannot be listed; these happen outside the
            per-notebook error handling and therefore propagate.
    """
    logging.info('Processing ' + notebookDirectory)

    # Each time a notebook is processed a snapshot is saved to a snapshot
    # sub-directory; create that sub-directory on first use.
    snapshot_root = os.path.join(notebookDirectory, snapshotDir)
    if not os.path.isdir(snapshot_root):
        os.mkdir(snapshot_root)

    for file in findFiles(notebookDirectory, '*.ipynb'):
        try:
            # splitext removes only the trailing extension regardless of its
            # case, so "Report.IPYNB" (which findFiles also matches) yields
            # "Report" instead of keeping the extension in the folder name.
            stem = os.path.splitext(os.path.basename(file))[0]
            now = datetime.now()

            # Within the snapshot directory, each notebook's output is stored
            # in its own sub-directory.
            notebookSnapshot = os.path.join(snapshot_root, stem)
            os.makedirs(notebookSnapshot, exist_ok=True)

            # The snapshot file name includes a timestamp.
            # NOTE(review): str(now) contains ':' characters, which are not
            # accepted in Windows file names - confirm the target platforms
            # before relying on this format there.
            output_file = os.path.join(notebookSnapshot, stem + " " + str(now) + ".ipynb")

            # Execute the notebook and save the snapshot.
            pm.execute_notebook(
                file,
                output_file,
                parameters=params
            )
        except Exception:
            # Log the failure (with traceback and the notebook path) and carry
            # on with the next notebook.
            logging.exception("Error processing notebook %s", file)
if __name__ == '__main__':
    # Work relative to the folder that contains this script.
    script_folder = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_folder)

    # Log to notebooks.log and echo the same records to the console.
    logging.basicConfig(
        filename='notebooks.log',
        level=logging.INFO,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    logging.getLogger('').addHandler(console_handler)

    # Make sure the notebook folders exist before anything is processed.
    for folder in ('daily', 'hourly', 'weekly'):
        if not os.path.isdir(folder):
            os.mkdir(folder)

    # Optional -d/--directory argument: when given, only that directory is
    # processed (useful with an external task scheduler); when omitted, the
    # script runs its own scheduler.
    cli = argparse.ArgumentParser(description="Description for my parser")
    cli.add_argument(
        "-d",
        "--directory",
        help="Which set of notebooks to process - e.g. hourly",
        required=False,
        default=False,
    )
    options = cli.parse_args()

    if not options.directory:
        # The schedule module is only needed for the internal scheduler.
        # Install it with: pip install schedule
        import schedule

        # Register the hourly, daily and weekly jobs.
        schedule.every().hour.at(':40').do(processNotebooks, notebookDirectory='hourly')
        schedule.every().day.at('13:15').do(processNotebooks, notebookDirectory='daily')
        schedule.every().sunday.at('13:15').do(processNotebooks, notebookDirectory='weekly')

        # Poll once per second and run whatever is due.
        while True:
            schedule.run_pending()
            time.sleep(1)
    else:
        # A directory was requested: process it once, then exit.
        processNotebooks(options.directory)
# (End of gist; the GitHub page footer that followed is not part of the script.)