# fozziethebeat/prefect_caching_tasks.py

## prefect_caching_tasks.py
import numpy as np
import pandas as pd
import prefect

from os import listdir
from os.path import isfile, join
from prefect import Flow, apply_map, case, task
from prefect.engine.results import LocalResult
from prefect.tasks.control_flow import merge

# Directory holding the input CSV files read by verify_checkpoint.
INPUT_BASE_PATH = './data/input'
# Directory holding the output CSV files read by verify_checkpoint.
OUTPUT_BASE_PATH = './data/output'
# Directory handed to LocalResult(dir=...) by several tasks in this file.
# NOTE(review): absolute, user-specific path -- confirm before running on
# another machine.
CACHE_DIR = "/home/fozziethebeat/devel/prefect_samples/data/cache"


@task(result=LocalResult(dir=CACHE_DIR, validate_dir=True))
def emit_filenames():
    '''
    Returns the names of the regular files found in the input data directory.

    Only bare file names are returned (not full paths) and sub-directories
    are skipped.  No extension filtering is done, so every regular file in
    INPUT_BASE_PATH is treated as a CSV by the downstream tasks.
    '''
    # List the same directory that the isfile() check looks into; the listing
    # previously used a hard-coded literal that could drift from the constant.
    return [
        name for name in listdir(INPUT_BASE_PATH)
        if isfile(join(INPUT_BASE_PATH, name))
    ]


@task(result=LocalResult(dir="./data/cache/.prefect"))
def verify_checkpoint(filename):
    '''
    Reads the input dataframe and the corresponding output dataframe, then
    figures out whether their checkpoints match.

    This assumes the input csv has the following fields:
      - region
      - metric
      - value
      - checkpoint

    This assumes the output csv has the following fields:
      - region
      - cases
      - deaths
      - ratio
      - checkpoint

    If no output csv exists yet for `filename`, the checkpoints are treated
    as not matching so that the output gets computed.

    Returns a tuple of two values:
      - True if the checkpoints match, False otherwise
      - The output dataframe when checkpoints match, otherwise the input
        dataframe
    '''
    input_path = join(INPUT_BASE_PATH, filename)
    output_path = join(OUTPUT_BASE_PATH, filename)

    input_dataframe = pd.read_csv(input_path)

    # Nothing to compare against (e.g. first run for this file): report a
    # mismatch instead of letting read_csv fail on the missing output.
    if not isfile(output_path):
        return (False, input_dataframe)

    output_dataframe = pd.read_csv(output_path)

    # The smallest checkpoint value of each frame is what gets compared.
    input_checkpoint = input_dataframe['checkpoint'].min()
    output_checkpoint = output_dataframe['checkpoint'].min()

    if input_checkpoint == output_checkpoint:
        return (True, output_dataframe)
    return (False, input_dataframe)


@task(result=LocalResult(dir=CACHE_DIR, validate_dir=True))
def checkpoint_matches(dataframes):
    '''
    Picks the truth value (the first element) out of the tuple.
    '''
    matched = dataframes[0]
    return matched


@task
def copy_result(dataframes):
    '''
    Picks the dataframe (the second element) out of the tuple.
    '''
    frame = dataframes[1]
    return frame
@task(result=LocalResult(dir=CACHE_DIR, validate_dir=True))
def compute_result(dataframe_tuple):
    '''
    Stands in for a real computation: reshapes the dataframe held in the
    second slot of the tuple and carries its checkpoint over to the result.
    '''
    source = dataframe_tuple[1]
    # Smallest checkpoint value present in the source frame.
    carried_checkpoint = source['checkpoint'].min()

    # Move the 'metric' index level into the columns, leaving 'region' as
    # the row index.
    without_checkpoint = source.drop('checkpoint', axis=1)
    indexed = without_checkpoint.set_index(['region', 'metric'], drop=True)
    pivoted = indexed.unstack()
    # Drop the outer column level left over from unstack().
    pivoted.columns = pivoted.columns.droplevel()

    pivoted['checkpoint'] = carried_checkpoint
    return pivoted


@task
def print_dataframe(dataframe):
    '''
    Writes the given dataframe to standard output.
    '''
    print(dataframe)


def compute_or_copy(dataframes):
    '''
    Reuses the old output dataframe when the checkpoints match, otherwise
    computes a fresh one; the two branches are merged into a single result.
    '''
    is_match = checkpoint_matches(dataframes)

    # Branch taken when the checkpoints match.
    with case(is_match, True):
        reused = copy_result(dataframes)

    # Branch taken when they do not.
    with case(is_match, False):
        recomputed = compute_result(dataframes)

    return merge(reused, recomputed)


# Build the flow graph: list the input files, check each file's checkpoint,
# copy or recompute the output per file, then print each result.
with Flow('ETL', result=LocalResult()) as flow:
    csv_filenames = emit_filenames()
    dataframes = verify_checkpoint.map(csv_filenames)
    final_dataframes = apply_map(compute_or_copy, dataframes)
    print_dataframe.map(final_dataframes)

# Run only after the `with` block has closed, i.e. once the flow is fully
# built and the flow-building context is no longer active.
flow.run()
	import numpy as np
	import pandas as pd
	import prefect

	from os import listdir
	from os.path import isfile, join
	from prefect import Flow, apply_map, case, task
	from prefect.engine.results import LocalResult
	from prefect.tasks.control_flow import merge

	# Directory holding the input CSV files read by verify_checkpoint.
	INPUT_BASE_PATH = './data/input'
	# Directory holding the output CSV files read by verify_checkpoint.
	OUTPUT_BASE_PATH = './data/output'
	# Directory handed to LocalResult(dir=...) by several tasks in this file.
	# NOTE(review): absolute, user-specific path -- confirm before running on
	# another machine.
	CACHE_DIR = "/home/fozziethebeat/devel/prefect_samples/data/cache"


	@task(result=LocalResult(dir=CACHE_DIR, validate_dir=True))
	def emit_filenames():
	    '''
	    Returns the names of the regular files found in the input data
	    directory.

	    Only bare file names are returned (not full paths) and sub-directories
	    are skipped.  No extension filtering is done, so every regular file in
	    INPUT_BASE_PATH is treated as a CSV by the downstream tasks.
	    '''
	    # List the same directory that the isfile() check looks into rather
	    # than a separate hard-coded literal.
	    return [
	        name for name in listdir(INPUT_BASE_PATH)
	        if isfile(join(INPUT_BASE_PATH, name))
	    ]


	@task(result=LocalResult(dir="./data/cache/.prefect"))
	def verify_checkpoint(filename):
	    '''
	    Reads the input dataframe and the corresponding output dataframe, then
	    figures out whether their checkpoints match.

	    This assumes the input csv has the following fields:
	      - region
	      - metric
	      - value
	      - checkpoint

	    This assumes the output csv has the following fields:
	      - region
	      - cases
	      - deaths
	      - ratio
	      - checkpoint

	    If no output csv exists yet for `filename`, the checkpoints are
	    treated as not matching so that the output gets computed.

	    Returns a tuple of two values:
	      - True if the checkpoints match, False otherwise
	      - The output dataframe when checkpoints match, otherwise the input
	        dataframe
	    '''
	    input_path = join(INPUT_BASE_PATH, filename)
	    output_path = join(OUTPUT_BASE_PATH, filename)

	    input_dataframe = pd.read_csv(input_path)

	    # Nothing to compare against (e.g. first run for this file): report a
	    # mismatch instead of letting read_csv fail on the missing output.
	    if not isfile(output_path):
	        return (False, input_dataframe)

	    output_dataframe = pd.read_csv(output_path)

	    # The smallest checkpoint value of each frame is what gets compared.
	    input_checkpoint = input_dataframe['checkpoint'].min()
	    output_checkpoint = output_dataframe['checkpoint'].min()

	    if input_checkpoint == output_checkpoint:
	        return (True, output_dataframe)
	    return (False, input_dataframe)


	@task(result=LocalResult(dir=CACHE_DIR, validate_dir=True))
	def checkpoint_matches(dataframes):
	    '''
	    Returns the truth value (the first element) of the tuple.
	    '''
	    return dataframes[0]


	@task
	def copy_result(dataframes):
	    '''
	    Returns the dataframe (the second element) of the tuple.
	    '''
	    return dataframes[1]
	@task(result=LocalResult(dir=CACHE_DIR, validate_dir=True))
	def compute_result(dataframe_tuple):
	    '''
	    Does some fake computation and copies the input checkpoint to output.

	    The dataframe is taken from the second element of the tuple.
	    '''
	    dataframe = dataframe_tuple[1]
	    # Smallest checkpoint value present in the frame.
	    checkpoint = dataframe['checkpoint'].min()
	    # Move the 'metric' index level into the columns, leaving 'region' as
	    # the row index.
	    result = (
	        dataframe.drop('checkpoint', axis=1)
	        .set_index(['region', 'metric'], drop=True)
	        .unstack()
	    )
	    # Drop the outer column level left over from unstack().
	    result.columns = result.columns.droplevel()
	    result['checkpoint'] = checkpoint
	    return result


	@task
	def print_dataframe(dataframe):
	    '''
	    Prints the given dataframe to standard output.
	    '''
	    print(dataframe)


	def compute_or_copy(dataframes):
	    '''
	    Copies the old output dataframe if checkpoints match or computes a new
	    output dataframe if needed, then merges the two branches.
	    '''
	    cond = checkpoint_matches(dataframes)
	    # Each assignment belongs inside its own case() block so that only
	    # one branch is taken per input.
	    with case(cond, True):
	        copy_df = copy_result(dataframes)
	    with case(cond, False):
	        compute_df = compute_result(dataframes)
	    return merge(copy_df, compute_df)


	# Build the flow graph: list the input files, check each file's
	# checkpoint, copy or recompute the output per file, then print it.
	with Flow('ETL', result=LocalResult()) as flow:
	    csv_filenames = emit_filenames()
	    dataframes = verify_checkpoint.map(csv_filenames)
	    final_dataframes = apply_map(compute_or_copy, dataframes)
	    print_dataframe.map(final_dataframes)

	# Run only after the `with` block has closed, i.e. once the flow is fully
	# built.
	flow.run()