GuyPaddock/split_zip.sh

## split_zip.sh
#!/usr/bin/env bash

##
# @file
#   Asset Zip Splitter
#
#   Splits a ZIP archive containing the following file structure into separate
#   archives that contain no more than 100 MB each:
#    - *.csv (CSV files that reference image files)
#    - images/ (a folder of images referenced by the CSV files)
#
#   This allows the file to be uploaded to a Drupal installation running on
#   Pantheon that is using Feeds Fetcher Archive to process the CSV file.
#   Custom code in the Drupal installation is required to handle import of the
#   images themselves (convert filenames to file references).
#
#   Each CSV file MUST contain a column named "Filename" that contains the
#   filename of each image asset referenced. When constructing each "bin" (i.e.
#   smaller ZIP file), this script automatically includes a copy of each CSV
#   file in each ZIP file, but rows that reference files that are not in the bin
#   are automatically omitted so that the CSV files inside each archive only
#   reference files that are actually in the bin.
#
#   Usage:
#   ```
#   ./split_zip.sh <ZIP filename> [max file size]
#   ```
#
#   If "max file size" is specified (in bytes), then the script will use that
#   limit when splitting archives. For example:
#   ```
#   ./split_zip.sh my.zip 200000000
#   ```
#   Would split an archive into ZIP files containing no more than 200 MB.
#
#   The output from the script is saved next to the original archive, in a
#   folder named after the archive with a "-split" suffix (e.g. "my-split/"),
#   with each piece named relative to the original archive (e.g. "my-001.zip",
#   "my-002.zip", etc).
#
#   Copyright 2019 Inveniem. All rights reserved.
#
# @author Guy Elsmore-Paddock (guy@inveniem.com)
# @license GNU General Public License, version 3 or later
#

# Abort when an unset variable is expanded and when a command exits non-zero.
#
# NOTE: errexit is not compatible with `let`, which returns non-zero whenever
# its expression evaluates to 0.
set -eu

################################################################################
# Constants
################################################################################

##
# The maximum size, in bytes, of a bin of files.
#
# Taken from the second script parameter when one is given and non-empty;
# otherwise falls back to 100,000,000 bytes (100 MB).
#
if [[ -n "${2:-}" ]]; then
  MAX_BIN_SIZE="${2}"
else
  MAX_BIN_SIZE="100000000"
fi

##
# Path relative to the extract path that contains the images.
#
RELATIVE_IMAGE_SOURCE_PATH="images"

##
# Suffix of the output folder, which is created next to the ZIP file as
# "<ZIP name without extension>-<this value>".
#
RELATIVE_TARGET_PATH="split"

################################################################################
# Main Functions
################################################################################
print_usage() {
  # One argument per output line; everything goes to stderr. The trailing
  # spaces inside the quotes are part of the emitted text.
  printf '%s\n' \
    "Usage: ${0} <ZIP filename> [max file size]" \
    "" \
    "Split a ZIP file that contains a CSV file and images into multiple " \
    "zip files that each contain no more than 'max file size' bytes of " \
    "content each. If the file size is not specified, defaults to " \
    "100,000,000 (100 MB)" >&2

  # Usage errors always terminate the script with status 1.
  exit 1
}

##
# Unzips the provided file to a temporary folder.
#
# Output global vars:
#  - $src_path: The absolute path to the folder that contains the contents of
#    the zip file.
#
# @param  string  $1
#   The path to the ZIP file to extract.
#
unzip_source_file() {
  local zip_path="${1}"
  local extract_path

  # Declared separately from the assignment so that a failure to create the
  # temporary folder is not hidden by `local`.
  extract_path=$(create_tmp_dir) || extract_path=""

  # Never unpack into (or queue the removal of) an empty or missing path.
  if [[ -z "${extract_path}" || ! -d "${extract_path}" ]]; then
    echo "ERROR: Could not create a temporary folder for extraction." >&2
    return 1
  fi

  delete_on_exit "${extract_path}"

  echo "Unpacking '${zip_path}'..."
  unzip "${zip_path}" -d "${extract_path}"

  echo "Done!"
  echo ""

  # Globals
  declare -g src_path="${extract_path}"
}

##
# Remove any existing output folder and then create it.
#
# The output path is relative to the folder that contains the input ZIP file.
#
# Output global vars:
#  - $output_path: The absolute path to the folder that should contain output
#    files.
#
# @param  string  $1
#   The path to the ZIP file that the script is operating on.
#
prepare_output_path() {
  local zip_filename="${1}"
  local zip_dir
  local zip_name
  local unresolved_path
  local resolved_path

  zip_dir=$(dirname "${zip_filename}")
  zip_name=$(zip_basename "${zip_filename}")
  unresolved_path="${zip_dir}/${zip_name}-${RELATIVE_TARGET_PATH}"

  # Create the folder BEFORE resolving it: `realpath` implementations that
  # only resolve existing paths would otherwise yield an empty string, and the
  # `rm`/`mkdir` calls below would then operate on an empty path.
  mkdir -p "${unresolved_path}"
  resolved_path=$(realpath "${unresolved_path}") || resolved_path=""

  if [[ -z "${resolved_path}" ]]; then
    echo "ERROR: Could not resolve output path '${unresolved_path}'." >&2
    return 1
  fi

  # Globals
  declare -g output_path="${resolved_path}"

  echo "Clearing and creating output path '${output_path}'..."

  # `:?` aborts rather than expanding to an empty path for `rm -rf`.
  rm -rf -- "${output_path:?}"
  mkdir -p "${output_path}"

  echo "Done!"
  echo ""
}

##
# Ensure that all CSV files in the source path are properly formed and refer to
# only files that exist.
#
# This is a necessary step because the process of breaking up a large set of
# files and metadata into smaller sets makes it more difficult to detect when
# one or more assets are missing or malformed during import.
#
# Input global vars:
#  - $src_path: The absolute path to the folder that contains the contents of
#    the zip file.
#
sanity_check_csv_files() {
  local csv_filepath

  # `IFS=` and `-r` keep each path exactly as `find` printed it (no trimming
  # of surrounding whitespace, no backslash processing). The loop is fed via
  # process substitution so that it runs in the current shell instead of a
  # pipeline sub-shell.
  while IFS= read -r csv_filepath; do
    sanity_check_csv_file "${csv_filepath}"
  done < <(find "${src_path}" -maxdepth 1 -name '*.csv')
}

##
# Ensure that the provided CSV file is properly formed and refers to only files
# that exist.
#
# @param  string  $1
#   The path to the CSV file to sanity check.
#
sanity_check_csv_file() {
  local target_csv="${1}"
  local check

  echo "Sanity-checking CSV files..."

  # Run the column check first, then the per-row filename check.
  for check in sanity_check_csv_columns sanity_check_csv_filenames; do
    "${check}" "${target_csv}"
  done

  # "Done!" followed by a blank separator line.
  printf '%s\n' "Done!" ""
}

##
# Ensure that the provided CSV file has a column named "Filename".
#
# Several parts of this script rely on the existence of this column.
#
# @param  string  $1
#   The path to the CSV file to sanity check.
#
sanity_check_csv_columns() {
  local target_csv="${1}"
  local filename_col=$(csv_get_filename_column_index "${target_csv}")

  # Happy path first: a non-empty index means the column was found.
  if [[ -n "${filename_col}" ]]; then
    echo " - 'Filename' column is present. [success]"
    return
  fi

  # No index was reported: explain on stderr and stop with exit code 2.
  printf '\n%s\n' \
    "ERROR: CSV file must contain a 'Filename' column: ${target_csv}" >&2

  pause_and_exit 2
}

##
# Ensure that every asset referenced in the provided CSV file exists.
#
# This is a necessary step because the process of breaking up a large set of
# files and metadata into smaller sets makes it more difficult to detect when
# one or more assets are missing during import.
#
# @param  string  $1
#   The path to the CSV file to sanity check.
#
sanity_check_csv_filenames() {
  local csv_filepath="${1}"
  local image_filename_col_idx
  local csv_image_filename

  # Row 1 is the header, so the first data row is row 2.
  local current_row_number=2

  image_filename_col_idx=$(csv_get_filename_column_index "${csv_filepath}")

  # Ensure that all image files exist.
  #
  # `-r` stops `read` from eating backslashes in filenames; the default IFS is
  # kept on purpose so surrounding whitespace is still trimmed. The loop reads
  # from a process substitution so that it runs in the current shell: the
  # abort below then ends the script itself rather than only a pipeline
  # sub-shell.
  while read -r csv_image_filename; do
    if [[ ! -f "${src_path}/${csv_image_filename}" ]]; then
      {
        echo ""
        echo "ERROR: Missing image file."
        echo " - CSV file:   ${csv_filepath}"
        echo " - Image file: ${csv_image_filename}"
        echo " - Row:        ${current_row_number}"
      } >&2

      pause_and_exit 2
    fi

    current_row_number=$(( current_row_number + 1 ))
  done < <(
    csv_file_to_psv "${csv_filepath}" |
      tail -n +2 |
      cut --delimiter='|' --fields="${image_filename_col_idx}"
  )

  echo " - All referenced files exist. [success]"
}

##
# Determine the file sizes of all files in the source folder.
#
# Input global vars:
#  - $src_path: The absolute path to the folder that contains the contents of
#    the zip file.
#
# Output global vars:
#  - $image_filenames: An array of filenames, ordered largest to smallest file.
#  - $image_sizes: An associative array of filenames => sizes.
#
read_source_file_sizes() {
  # Globals
  declare -ag image_filenames
  declare -Ag image_sizes

  local size
  local filename

  echo "Determining sizes of files..."

  # `stat` emits "<SIZE>|<FILENAME>" so that the size is always the first
  # field, even when a filename itself contains "|". The list is sorted
  # NUMERICALLY on the size, largest first; a plain text sort would place "9"
  # ahead of "100".
  #
  # Each line is converted to:
  #  - An array of filenames, ordered largest to smallest file
  #    ($image_filenames)
  #  - An associative array of filenames => sizes ($image_sizes)
  while IFS='|' read -r size filename; do
    # An empty key is not a valid associative array subscript.
    [[ -n "${filename}" ]] || continue

    image_filenames+=("${filename}")
    image_sizes["${filename}"]="${size}"
  done < <(
    find "${src_path}/${RELATIVE_IMAGE_SOURCE_PATH}" \
      -type f \
      -exec stat --format='%s|%n' '{}' + |
      sort --field-separator='|' --key=1,1nr
  )

  echo "Done!"
  echo ""
}

##
# Copy files from the source path into "bins" of files that are each no larger
# than the maximum size.
#
# Input global vars:
#  - $src_path: The absolute path to the folder that contains the contents of
#    the zip file.
#  - $output_path: The absolute path to the folder that should contain output
#    files.
#  - $image_filenames: An array of filenames, ordered largest to smallest file.
#  - $image_sizes: An associative array of filenames => sizes.
#
# @param  string  $1
#   The path to the ZIP file that the script is operating on.
#
organize_files() {
  local archive_path="${1}"
  local bin_prefix=$(zip_basename "${archive_path}")

  # NOTE: intentionally not declared `local`, matching the existing behaviour
  # of this counter.
  sequence_number=1

  # Greedy packing: keep building bins (largest files that fit first) until
  # every file has been removed from $image_sizes.
  until [[ "${#image_sizes[@]}" -le 0 ]]; do
    calculate_solution
    copy_and_zip_solution_files "${bin_prefix}" "${sequence_number}"

    sequence_number=$(( sequence_number + 1 ))
  done

  printf '%s\n' \
    "Done!" \
    "" \
    "Output can be found in: ${output_path}" \
    ""

  pause_and_exit
}

##
# Locate files that are small enough to fit in the current bin.
#
# The total size of the bin is guaranteed to be no larger than the maximum
# total size, but may be smaller if the remaining files do not fit or there
# are no files left.
#
# Input global vars:
#  - $image_filenames: An array of filenames, ordered largest to smallest file.
#  - $image_sizes: An associative array of filenames => sizes.
#
# Output global vars:
#  - $current_solution: An array containing filenames of all files that will fit
#    in the current bin.
#
calculate_solution() {
  # Globals
  declare -ag current_solution

  current_solution=()

  # Loop state is local so that it does not leak into (or get clobbered by)
  # the rest of the script.
  local filename
  local file_size
  local current_solution_size=0
  local new_solution_size=0

  # NOTE: We can't loop over the hash keys because they are sorted by hash code,
  # not size.
  for filename in "${image_filenames[@]}"; do
    # Files already placed in an earlier bin have been removed from
    # $image_sizes; skip them.
    [[ -n "${image_sizes[$filename]+_}" ]] || continue

    file_size="${image_sizes[$filename]}"
    new_solution_size=$(( current_solution_size + file_size ))

    if [[ "${new_solution_size}" -le "${MAX_BIN_SIZE}" ]]; then
      current_solution+=("${filename}")
      current_solution_size="${new_solution_size}"

      # Quoted so the subscript is never treated as a glob pattern.
      unset "image_sizes[$filename]"
    fi
  done

  # Files are still waiting to be placed but none of them fit: they must be
  # larger than the maximum bin size. The check looks at what is LEFT
  # ($image_sizes), not at $image_filenames, which never shrinks.
  if [[ "${#image_sizes[@]}" -gt 0 && \
        "${#current_solution[@]}" -eq 0 ]]; then
    {
      echo "ERROR: Cannot create small enough bin -- remaining files may be "
      echo "too large."
    } >&2
    pause_and_exit 3
  fi
}

##
# Copy all of the files for the current solution to the folder for the
# corresponding bin of files.
#
# Input global vars:
#  - $current_solution: An array containing filenames of all files that will fit
#    in the current bin.
#
# @param  string  $1
#   The prefix to use for each ZIP file.
# @param  integer $2
#   The current sequence number.
#
copy_and_zip_solution_files() {
  local zip_prefix="${1}"
  local sequence_number="${2}"
  local bin_label
  local bin_dir

  # Zero-pad the sequence number to three digits (e.g. 7 => "007").
  printf -v bin_label '%03d' "${sequence_number}"

  # Each bin is staged in its own folder under the output path.
  bin_dir="${output_path}/${bin_label}"
  mkdir -p "${bin_dir}"

  echo "=== Bin #${bin_label} === "

  # Stage the images, then the filtered CSV files, then archive the folder.
  copy_solution_images "${bin_dir}"
  copy_solution_csv_files "${bin_dir}"
  zip_solution_files "${bin_dir}" "${zip_prefix}" "${bin_label}"
}

##
# Copy all of the image files/assets for the current solution to the folder for
# the corresponding bin of files.
#
# Input global vars:
#  - $current_solution: An array containing filenames of all files that will fit
#    in the current bin.
#
# @param  string  $1
#   The target path where files are stored for the current solution bin.
#
copy_solution_images() {
  local bin_target_path="${1}"
  local image_root="${src_path}/${RELATIVE_IMAGE_SOURCE_PATH}"
  local filename
  local relative_dir
  local target_dir

  echo "Select and copy images:"

  for filename in "${current_solution[@]}"; do
    target_dir="${bin_target_path}/${RELATIVE_IMAGE_SOURCE_PATH}"

    # Keep any sub-folder the image lives in below the image root. Copying
    # everything into one flat folder would let same-named files from
    # different sub-folders overwrite each other, and would put the copy at a
    # different relative path than the one the source file has.
    if [[ "${filename}" == "${image_root}/"*/* ]]; then
      relative_dir="${filename#"${image_root}/"}"
      relative_dir="${relative_dir%/*}"
      target_dir="${target_dir}/${relative_dir}"
    fi

    mkdir -p "${target_dir}"

    cp -v "${filename}" "${target_dir}"
  done

  echo ""
}

##
# Copy a filtered version of each CSV file to the folder for the corresponding
# bin of files.
#
# Rows in the CSV files that refer to files that are not in the specified bin
# will automatically be filtered out.
#
# Input global vars:
#  - $src_path: The absolute path to the folder that contains the contents of
#    the zip file.
#  - $current_solution: An array containing filenames of all files that will fit
#    in the current bin.
#
# @param  string  $1
#   The target path where files are stored for the current solution bin.
#
copy_solution_csv_files() {
  local bin_target_path="${1}"
  local csv_filepath
  local csv_filename
  local target_csv_filename
  local image_filename_col_idx
  local csv_line
  local csv_image_filename

  echo "Filter and copy CSV files:"

  while IFS= read -r csv_filepath; do
    image_filename_col_idx=$(csv_get_filename_column_index "${csv_filepath}")

    csv_filename=$(basename "${csv_filepath}")
    target_csv_filename="${bin_target_path}/${csv_filename}"

    echo " - Creating filtered copy of '${csv_filepath}' as '${target_csv_filename}'..."

    # Always copy header line
    head -n 1 "${csv_filepath}" > "${target_csv_filename}"

    # Filter out CSV lines that reference files not in this solution bin.
    #
    # `IFS=` and `-r` keep every row byte-for-byte (no whitespace trimming, no
    # backslash processing), and the `|| [[ -n ... ]]` clause keeps a final
    # row that is not terminated by a newline.
    while IFS= read -r csv_line || [[ -n "${csv_line}" ]]; do
      csv_image_filename=$(
        csv_line_to_psv "${csv_line}" |
          cut --delimiter='|' --fields="${image_filename_col_idx}"
      )

      # A row is kept only when the file it names was copied into this bin.
      if [[ -n "${csv_image_filename}" && \
            -f "${bin_target_path}/${csv_image_filename}" ]]; then
        printf '%s\n' "${csv_line}" >> "${target_csv_filename}"
      fi
    done < <(tail -n +2 "${csv_filepath}")
  done < <(find "${src_path}" -maxdepth 1 -name '*.csv')

  echo ""
}

##
# Convert a bin (folder of files) into a ZIP file.
#
# The ZIP file is named after the bin sequence number.
#
# @param  string  $1
#   The target path containing the files to put into the ZIP archive.
# @param  string  $2
#   The ZIP file prefix.
# @param  string  $3
#   The three-digit bin sequence number.
#
zip_solution_files() {
  local bin_target_path="${1}"
  local zip_prefix="${2}"
  local bin_sequence_number="${3}"

  local target_zip_filename="${output_path}/${zip_prefix}-${bin_sequence_number}.zip"

  echo "Zipping bin as '${target_zip_filename}'..."

  # Change directory inside a sub-shell so that the caller's working directory
  # is never touched and nothing extra is printed when "returning" to it.
  # `zip -m` removes the files it archives, so it must never run unless the
  # `cd` into the bin folder succeeded.
  (
    cd "${bin_target_path}" || exit 1
    zip -9 -rm "${target_zip_filename}" ./
  ) || return 1

  # The bin folder is expected to be empty once its contents were moved into
  # the archive.
  rmdir "${bin_target_path}"

  echo ""
}

################################################################################
# Utility Functions
################################################################################

##
# Create a temporary directory.
#
# This function should be invoked in a sub-shell so that the directory path can
# be echo-ed into a local variable. For example:
#   local tmp_dir=$(create_tmp_dir)
#
# If the directory should be removed when this script exits, be sure to call
# delete_on_exit and pass it the directory name. For example:
#   local tmp_dir=$(create_tmp_dir)
#   delete_on_exit "${tmp_dir}"
#
# @return string
#   The path to the temporary folder.
#
create_tmp_dir() {
  local tmpdir

  # Declared separately from the assignment so that a `mktemp` failure is
  # reported to the caller instead of being masked by `local`; in that case
  # nothing is printed.
  tmpdir=$(mktemp -d '/tmp/split_zip.XXXXXXXXXX') || return 1

  echo "${tmpdir}"
}

##
# Delete a file or folder when this script exits normally or abnormally.
#
# @param  string  $1
#   The path to the file/folder to delete.
#
delete_on_exit() {
  local target="${1}"
  local quoted_target

  # The queued command is a plain string that is evaluated again at exit, so
  # the path is shell-quoted here. Without this, a path that contains spaces
  # or other special characters would be split into several `rm` arguments.
  printf -v quoted_target '%q' "${target}"

  add_on_exit "rm -rf -- ${quoted_target}"
}

# Credit:
# https://www.linuxjournal.com/content/use-bash-trap-statement-cleanup-temporary-files
# Commands queued by add_on_exit(), in the order they were added.
declare -a on_exit_items=()

##
# Queue-up a command to run when this script exits normally or abnormally.
#
# The shell options of the caller are left untouched; in particular, `nounset`
# stays enabled for the rest of the script.
#
# @param  string  $*
#   The command and arguments to queue-up.
#
add_on_exit() {
  local n="${#on_exit_items[@]}"

  on_exit_items+=("$*")

  # Setup trap on the first item added to the list
  if [[ "${n}" -eq 0 ]]; then
    trap dispatch_on_exit_items INT TERM HUP EXIT
  fi
}

##
# Execute commands that were queued-up for when this script exits.
#
dispatch_on_exit_items() {
  local queued_command

  # Tolerate an empty queue on shells that treat expanding an empty array as
  # an unset-variable error.
  set +u

  for queued_command in "${on_exit_items[@]}"; do
    # Quoted so the command string is evaluated exactly as it was queued,
    # without an extra round of word splitting and globbing first.
    eval "${queued_command}"
  done
}

##
# Determine which column in a CSV file contains filenames.
#
# @param  string  $1
#   The path to the CSV file.
#
csv_get_filename_column_index() {
  local csv_filepath="${1}"

  # Only the header row (first line of the pipe-separated output) is looked
  # at. Print the 1-based position of the first column whose text contains
  # "Filename"; print nothing when there is no such column.
  csv_file_to_psv "${csv_filepath}" |
    awk -F'|' '
      NR == 1 {
        for (col = 1; col <= NF; col++) {
          if ($col ~ /Filename/) {
            print col
            break
          }
        }
        exit
      }
    '
}

##
# Convert a Comma-Separated Values (CSV) file to a Pipe-Separated Values (PSV)
# file.
#
# Any interior commas (i.e. commas inside quoted field values) are ignored.
#
# This is needed so that it's easier to work with data that contains embedded
# commas and/or quotes. The pipe symbol does not appear often (if at all) in
# data that Inveniem works with.
#
# @param  string  $1
#   The path to the CSV file.
#
csv_file_to_psv() {
  local source_csv="${1}"

  # Adapted from: https://unix.stackexchange.com/a/450813
  #
  # On every line, turn one comma that sits outside double-quoted text into a
  # pipe, then jump back and repeat for as long as a substitution was made.
  sed -E \
    -e ':next_comma' \
    -e 's/^(([^",]|"[^"]*")*),/\1|/' \
    -e 't next_comma' \
    "${source_csv}"
}

##
# Convert a single line of a Comma-Separated Values (CSV) file into a
# Pipe-Separated Values (PSV) file.
#
# Any interior commas (i.e. commas inside quoted field values) are ignored.
#
# This is needed so that it's easier to work with data that contains embedded
# commas and/or quotes. The pipe symbol does not appear often (if at all) in
# data that Inveniem works with.
#
# @param  string  $1
#   The line of CSV text to convert.
#
csv_line_to_psv() {
  local csv_line="${1}"

  # From:
  # https://unix.stackexchange.com/a/450813
  #
  # `printf` is used instead of `echo` so that a line that happens to look
  # like an `echo` option (e.g. "-n") is passed through as data.
  printf '%s\n' "${csv_line}" | sed -Ee :1 -e 's/^(([^",]|"[^"]*")*),/\1|/;t1'
}

##
# Convert a ZIP file path into just the name of the file without file extension.
#
# @param  string $1
#   The path to the target ZIP file.
#
zip_basename() {
  local archive_path="${1}"

  # Extension removed from the end of the file name, when present.
  local archive_suffix='.zip'

  # `basename` drops the leading folders and then the given suffix.
  basename "${archive_path}" "${archive_suffix}"
}

##
# Pause with the message "Press ENTER to continue" before exiting the program.
#
# This makes it easier for users to invoke this script via a drag-and-drop
# approach instead of having to run it via CLI to see error output, in the event
# that what they gave the program has problems.
#
# @param  integer $1 [optional]
#   The exit code to return when the script exits. The default is 0 (success).
#
pause_and_exit() {
  local exit_code="${1:-0}"

  echo ""

  # Only pause when a terminal can actually be opened. Without this check, a
  # run with no controlling terminal fails on the `/dev/tty` redirection and,
  # under `set -e`, exits with that failure's status instead of the requested
  # exit code.
  if ( : < /dev/tty ) 2>/dev/null; then
    read -r -p "Press ENTER to continue." _ < /dev/tty || true
  fi

  exit "${exit_code}"
}

##
# Shim for `realpath` on systems like OSX.
#
# Leans on PHP's or Perl's implementation instead.
#
if ! command -v realpath >/dev/null 2>&1; then
  # No native `realpath`: define a stand-in that delegates to whichever of
  # PHP or Perl is installed (PHP is preferred).
  realpath() {
    local target="${1}"

    if command -v php >/dev/null 2>&1; then
      php -r 'echo realpath($argv[1]);' -- "${target}"
      return
    fi

    if command -v perl >/dev/null 2>&1; then
      perl -e 'use Cwd "abs_path";print abs_path(shift)' "${target}"
      return
    fi

    # Neither interpreter is available: report it and stop with code 10.
    {
      echo "'realpath' is not supported on this system, and there are no"
      echo "alternatives (PHP, Perl) available."
    } >&2

    pause_and_exit 10
  }
fi

################################################################################
# Main Script Body
################################################################################

# Entry point: validate the command line, then unpack, check, measure, and
# split the archive.
if [[ $# -lt 1 || $# -gt 2 ]]; then
  print_usage
elif [[ -n "${2:-}" && ! "${2}" =~ ^[1-9][0-9]*$ ]]; then
  # Reject a malformed size up front. Otherwise the value only fails later, in
  # the numeric comparisons that run after the archive has been unpacked.
  {
    echo "ERROR: max file size must be a positive whole number of bytes: '${2}'"
    echo ""
  } >&2

  print_usage
else
  zip_filename="${1}"

  unzip_source_file "${zip_filename}"
  prepare_output_path "${zip_filename}"
  sanity_check_csv_files

  read_source_file_sizes
  organize_files "${zip_filename}"
fi
	#!/usr/bin/env bash

	##
	# @file
	# Asset Zip Splitter
	#
	# Splits a ZIP archive containing the following file structure into separate
	# archives that contain no more than 100 MB each:
	# - *.csv (CSV files that reference image files)
	# - images/ (a folder of images referenced by the CSV files)
	#
	# This allows the file to be uploaded to a Drupal installation running on
	# Pantheon that is using Feeds Fetcher Archive to process the CSV file.
	# Custom code in the Drupal installation is required to handle import of the
	# images themselves (convert filenames to file references).
	#
	# Each CSV file MUST contain a column named "Filename" that contains the
	# filename of each image asset referenced. When constructing each "bin" (i.e.
	# smaller ZIP file), this script automatically includes a copy of each CSV
	# file in each ZIP file, but rows that reference files that are not in the bin
	# are automatically omitted so that the CSV files inside each archive only
	# reference files that are actually in the bin.
	#
	# Usage:
	# ```
	# ./split_zip.sh <ZIP filename> [max file size]
	# ```
	#
	# If "max file size" is specified (in bytes), then the script will use that
	# limit when splitting archives. For example:
	# ```
	# ./split_zip.sh my.zip 200000000
	# ```
	# Would split an archive into ZIP files containing no more than 200 MB.
	#
	# The output from the script is saved next to the original archive, in a
	# folder named after the archive with a "-split" suffix (e.g. "my-split/"),
	# with each piece named relative to the original archive (e.g. "my-001.zip",
	# "my-002.zip", etc).
	#
	# Copyright 2019 Inveniem. All rights reserved.
	#
	# @author Guy Elsmore-Paddock (guy@inveniem.com)
	# @license GNU General Public License, version 3 or later
	#

	# Abort when an unset variable is expanded and when a command exits non-zero.
	#
	# NOTE: errexit is not compatible with `let`, which returns non-zero whenever
	# its expression evaluates to 0.
	set -eu

	################################################################################
	# Constants
	################################################################################

	##
	# The maximum size, in bytes, of a bin of files.
	#
	# Taken from the second script parameter when one is given and non-empty;
	# otherwise falls back to 100,000,000 bytes (100 MB).
	#
	if [[ -n "${2:-}" ]]; then
	  MAX_BIN_SIZE="${2}"
	else
	  MAX_BIN_SIZE="100000000"
	fi

	##
	# Path relative to the extract path that contains the images.
	#
	RELATIVE_IMAGE_SOURCE_PATH="images"

	##
	# Suffix of the output folder, which is created next to the ZIP file as
	# "<ZIP name without extension>-<this value>".
	#
	RELATIVE_TARGET_PATH="split"

	################################################################################
	# Main Functions
	################################################################################
	print_usage() {
	  # One argument per output line; everything goes to stderr. The trailing
	  # spaces inside the quotes are part of the emitted text.
	  printf '%s\n' \
	    "Usage: ${0} <ZIP filename> [max file size]" \
	    "" \
	    "Split a ZIP file that contains a CSV file and images into multiple " \
	    "zip files that each contain no more than 'max file size' bytes of " \
	    "content each. If the file size is not specified, defaults to " \
	    "100,000,000 (100 MB)" >&2

	  # Usage errors always terminate the script with status 1.
	  exit 1
	}

	##
	# Unzips the provided file to a temporary folder.
	#
	# Output global vars:
	# - $src_path: The absolute path to the folder that contains the contents of
	# the zip file.
	#
	# @param string $1
	# The path to the ZIP file to extract.
	#
	unzip_source_file() {
	  local zip_path="${1}"
	  local extract_path

	  # Declared separately from the assignment so that a failure to create the
	  # temporary folder is not hidden by `local`.
	  extract_path=$(create_tmp_dir) || extract_path=""

	  # Never unpack into (or queue the removal of) an empty or missing path.
	  if [[ -z "${extract_path}" || ! -d "${extract_path}" ]]; then
	    echo "ERROR: Could not create a temporary folder for extraction." >&2
	    return 1
	  fi

	  delete_on_exit "${extract_path}"

	  echo "Unpacking '${zip_path}'..."
	  unzip "${zip_path}" -d "${extract_path}"

	  echo "Done!"
	  echo ""

	  # Globals
	  declare -g src_path="${extract_path}"
	}

	##
	# Remove any existing output folder and then create it.
	#
	# The output path is relative to the folder that contains the input ZIP file.
	#
	# Output global vars:
	# - $output_path: The absolute path to the folder that should contain output
	# files.
	#
	# @param string $1
	# The path to the ZIP file that the script is operating on.
	#
	prepare_output_path() {
	  local zip_filename="${1}"
	  local zip_dir
	  local zip_name
	  local unresolved_path
	  local resolved_path

	  zip_dir=$(dirname "${zip_filename}")
	  zip_name=$(zip_basename "${zip_filename}")
	  unresolved_path="${zip_dir}/${zip_name}-${RELATIVE_TARGET_PATH}"

	  # Create the folder BEFORE resolving it: `realpath` implementations that
	  # only resolve existing paths would otherwise yield an empty string, and the
	  # `rm`/`mkdir` calls below would then operate on an empty path.
	  mkdir -p "${unresolved_path}"
	  resolved_path=$(realpath "${unresolved_path}") || resolved_path=""

	  if [[ -z "${resolved_path}" ]]; then
	    echo "ERROR: Could not resolve output path '${unresolved_path}'." >&2
	    return 1
	  fi

	  # Globals
	  declare -g output_path="${resolved_path}"

	  echo "Clearing and creating output path '${output_path}'..."

	  # `:?` aborts rather than expanding to an empty path for `rm -rf`.
	  rm -rf -- "${output_path:?}"
	  mkdir -p "${output_path}"

	  echo "Done!"
	  echo ""
	}

	##
	# Ensure that all CSV files in the source path are properly formed and refer to
	# only files that exist.
	#
	# This is a necessary step because the process of breaking up a large set of
	# files and metadata into smaller sets makes it more difficult to detect when
	# one or more assets are missing or malformed during import.
	#
	# Input global vars:
	# - $src_path: The absolute path to the folder that contains the contents of
	# the zip file.
	#
	sanity_check_csv_files() {
	  local csv_filepath

	  # The list of CSV files is fed to the loop through a process substitution.
	  # (A backslash-escaped pipe would be handed to `find` as a literal argument
	  # and leave the `do` keyword dangling.)
	  #
	  # `IFS=` and `-r` keep each path exactly as `find` printed it, and the loop
	  # runs in the current shell rather than in a pipeline sub-shell.
	  while IFS= read -r csv_filepath; do
	    sanity_check_csv_file "${csv_filepath}"
	  done < <(find "${src_path}" -maxdepth 1 -name '*.csv')
	}

	##
	# Ensure that the provided CSV file is properly formed and refers to only files
	# that exist.
	#
	# @param string $1
	# The path to the CSV file to sanity check.
	#
	sanity_check_csv_file() {
	  local target_csv="${1}"
	  local check

	  echo "Sanity-checking CSV files..."

	  # Run the column check first, then the per-row filename check.
	  for check in sanity_check_csv_columns sanity_check_csv_filenames; do
	    "${check}" "${target_csv}"
	  done

	  # "Done!" followed by a blank separator line.
	  printf '%s\n' "Done!" ""
	}

	##
	# Ensure that the provided CSV file has a column named "Filename".
	#
	# Several parts of this script rely on the existence of this column.
	#
	# @param string $1
	# The path to the CSV file to sanity check.
	#
	sanity_check_csv_columns() {
	  local target_csv="${1}"
	  local filename_col=$(csv_get_filename_column_index "${target_csv}")

	  # Happy path first: a non-empty index means the column was found.
	  if [[ -n "${filename_col}" ]]; then
	    echo " - 'Filename' column is present. [success]"
	    return
	  fi

	  # No index was reported: explain on stderr and stop with exit code 2.
	  printf '\n%s\n' \
	    "ERROR: CSV file must contain a 'Filename' column: ${target_csv}" >&2

	  pause_and_exit 2
	}

	##
	# Ensure that every asset referenced in the provided CSV file exists.
	#
	# This is a necessary step because the process of breaking up a large set of
	# files and metadata into smaller sets makes it more difficult to detect when
	# one or more assets are missing during import.
	#
	# @param string $1
	# The path to the CSV file to sanity check.
	#
	sanity_check_csv_filenames() {
	  local csv_filepath="${1}"
	  local image_filename_col_idx
	  local csv_image_filename

	  # Row 1 is the header, so the first data row is row 2.
	  local current_row_number=2

	  image_filename_col_idx=$(csv_get_filename_column_index "${csv_filepath}")

	  # Ensure that all image files exist.
	  #
	  # The stages are joined with real (unescaped) pipes and the field
	  # delimiter is a bare "|". `-r` stops `read` from eating backslashes in
	  # filenames; the default IFS is kept on purpose so surrounding whitespace
	  # is still trimmed. The loop reads from a process substitution so that it
	  # runs in the current shell: the abort below then ends the script itself
	  # rather than only a pipeline sub-shell.
	  while read -r csv_image_filename; do
	    if [[ ! -f "${src_path}/${csv_image_filename}" ]]; then
	      {
	        echo ""
	        echo "ERROR: Missing image file."
	        echo " - CSV file: ${csv_filepath}"
	        echo " - Image file: ${csv_image_filename}"
	        echo " - Row: ${current_row_number}"
	      } >&2

	      pause_and_exit 2
	    fi

	    current_row_number=$(( current_row_number + 1 ))
	  done < <(
	    csv_file_to_psv "${csv_filepath}" |
	      tail -n +2 |
	      cut --delimiter='|' --fields="${image_filename_col_idx}"
	  )

	  echo " - All referenced files exist. [success]"
	}

	##
	# Determine the file sizes of all files in the source folder.
	#
	# Input global vars:
	# - $src_path: The absolute path to the folder that contains the contents of
	# the zip file.
	#
	# Output global vars:
	# - $image_filenames: An array of filenames, ordered largest to smallest file.
	# - $image_sizes: An associative array of filenames => sizes.
	#
	read_source_file_sizes() {
	  # Globals
	  declare -ag image_filenames
	  declare -Ag image_sizes

	  local size
	  local filename

	  echo "Determining sizes of files..."

	  # `stat` emits "<SIZE>|<FILENAME>" (with a bare "|" separator) so that the
	  # size is always the first field, even when a filename itself contains
	  # "|". The list is sorted NUMERICALLY on the size, largest first; a plain
	  # text sort would place "9" ahead of "100".
	  #
	  # Each line is converted to:
	  #  - An array of filenames, ordered largest to smallest file
	  #    ($image_filenames)
	  #  - An associative array of filenames => sizes ($image_sizes)
	  while IFS='|' read -r size filename; do
	    # An empty key is not a valid associative array subscript.
	    [[ -n "${filename}" ]] || continue

	    image_filenames+=("${filename}")
	    image_sizes["${filename}"]="${size}"
	  done < <(
	    find "${src_path}/${RELATIVE_IMAGE_SOURCE_PATH}" \
	      -type f \
	      -exec stat --format='%s|%n' '{}' + |
	      sort --field-separator='|' --key=1,1nr
	  )

	  echo "Done!"
	  echo ""
	}

	##
	# Copy files from the source path into "bins" of files that are each no larger
	# than the maximum size.
	#
	# Input global vars:
	# - $src_path: The absolute path to the folder that contains the contents of
	# the zip file.
	# - $output_path: The absolute path to the folder that should contain output
	# files.
	# - $image_filenames: An array of filenames, ordered largest to smallest file.
	# - $image_sizes: An associative array of filenames => sizes.
	#
	# @param string $1
	# The path to the ZIP file that the script is operating on.
	#
	organize_files() {
	  local archive_path="${1}"
	  local bin_prefix=$(zip_basename "${archive_path}")

	  # NOTE: intentionally not declared `local`, matching the existing behaviour
	  # of this counter.
	  sequence_number=1

	  # Greedy packing: keep building bins (largest files that fit first) until
	  # every file has been removed from $image_sizes.
	  until [[ "${#image_sizes[@]}" -le 0 ]]; do
	    calculate_solution
	    copy_and_zip_solution_files "${bin_prefix}" "${sequence_number}"

	    sequence_number=$(( sequence_number + 1 ))
	  done

	  printf '%s\n' \
	    "Done!" \
	    "" \
	    "Output can be found in: ${output_path}" \
	    ""

	  pause_and_exit
	}

	##
	# Locate files that are small enough to fit in the current bin.
	#
	# The total size of the bin is guaranteed to be no larger than the maximum
	# total size, but may be smaller if the remaining files do not fit or there
	# are no files left.
	#
	# Input global vars:
	# - $image_filenames: An array of filenames, ordered largest to smallest file.
	# - $image_sizes: An associative array of filenames => sizes.
	#
	# Output global vars:
	# - $current_solution: An array containing filenames of all files that will fit
	# in the current bin.
	#
	calculate_solution() {
	  # Globals
	  declare -ag current_solution

	  current_solution=()

	  # Loop state is local so that it does not leak into (or get clobbered by)
	  # the rest of the script.
	  local filename
	  local file_size
	  local current_solution_size=0
	  local new_solution_size=0

	  # NOTE: We can't loop over the hash keys because they are sorted by hash code,
	  # not size.
	  for filename in "${image_filenames[@]}"; do
	    # Files already placed in an earlier bin have been removed from
	    # $image_sizes; skip them.
	    [[ -n "${image_sizes[$filename]+_}" ]] || continue

	    file_size="${image_sizes[$filename]}"

	    # Arithmetic expansion needs no "or true" guard (and no escaped "||"
	    # words handed to `let`) to be safe under errexit.
	    new_solution_size=$(( current_solution_size + file_size ))

	    if [[ "${new_solution_size}" -le "${MAX_BIN_SIZE}" ]]; then
	      current_solution+=("${filename}")
	      current_solution_size="${new_solution_size}"

	      # Quoted so the subscript is never treated as a glob pattern.
	      unset "image_sizes[$filename]"
	    fi
	  done

	  # Files are still waiting to be placed but none of them fit: they must be
	  # larger than the maximum bin size. The check looks at what is LEFT
	  # ($image_sizes), not at $image_filenames, which never shrinks.
	  if [[ "${#image_sizes[@]}" -gt 0 && \
	        "${#current_solution[@]}" -eq 0 ]]; then
	    {
	      echo "ERROR: Cannot create small enough bin -- remaining files may be "
	      echo "too large."
	    } >&2
	    pause_and_exit 3
	  fi
	}

	##
	# Copy all of the files for the current solution to the folder for the
	# corresponding bin of files.
	#
	# Input global vars:
	# - $current_solution: An array containing filenames of all files that will fit
	# in the current bin.
	#
	# @param string $1
	# The prefix to use for each ZIP file.
	# @param integer $2
	# The current sequence number.
	#
	copy_and_zip_solution_files() {
	  local zip_prefix="${1}"
	  local sequence_number="${2}"
	  local bin_label
	  local bin_dir

	  # Zero-pad the sequence number to three digits (e.g. 7 => "007").
	  printf -v bin_label '%03d' "${sequence_number}"

	  # Each bin is staged in its own folder under the output path.
	  bin_dir="${output_path}/${bin_label}"
	  mkdir -p "${bin_dir}"

	  echo "=== Bin #${bin_label} === "

	  # Stage the images, then the filtered CSV files, then archive the folder.
	  copy_solution_images "${bin_dir}"
	  copy_solution_csv_files "${bin_dir}"
	  zip_solution_files "${bin_dir}" "${zip_prefix}" "${bin_label}"
	}

	##
	# Copy all of the image files/assets for the current solution to the folder for
	# the corresponding bin of files.
	#
	# Input global vars:
	# - $current_solution: An array containing filenames of all files that will fit
	# in the current bin.
	#
	# @param string $1
	# The target path where files are stored for the current solution bin.
	#
	copy_solution_images() {
	  local bin_target_path="${1}"
	  local image_root="${src_path}/${RELATIVE_IMAGE_SOURCE_PATH}"
	  local filename
	  local relative_dir
	  local target_dir

	  echo "Select and copy images:"

	  for filename in "${current_solution[@]}"; do
	    target_dir="${bin_target_path}/${RELATIVE_IMAGE_SOURCE_PATH}"

	    # Keep any sub-folder the image lives in below the image root. Copying
	    # everything into one flat folder would let same-named files from
	    # different sub-folders overwrite each other, and would put the copy at a
	    # different relative path than the one the source file has.
	    if [[ "${filename}" == "${image_root}/"*/* ]]; then
	      relative_dir="${filename#"${image_root}/"}"
	      relative_dir="${relative_dir%/*}"
	      target_dir="${target_dir}/${relative_dir}"
	    fi

	    mkdir -p "${target_dir}"

	    cp -v "${filename}" "${target_dir}"
	  done

	  echo ""
	}

	##
	# Copy a filtered version of each CSV file to the folder for the corresponding
	# bin of files.
	#
	# The header line of each CSV file is always kept. A data row is kept only if
	# the file named in its filename column exists (relative to the bin folder);
	# all other rows are omitted. Rows are written back unmodified.
	#
	# Input global vars:
	#   - $src_path: The absolute path to the folder that contains the contents of
	#     the zip file. Only CSV files directly inside it are processed.
	#
	# @param string $1
	#   The target path where files are stored for the current solution bin. The
	#   images for the bin must already have been copied into it.
	#
	copy_solution_csv_files() {
	  local bin_target_path="${1}"
	  local csv_filepath
	  local csv_filename
	  local target_csv_filename
	  local image_filename_col_idx
	  local csv_line
	  local csv_image_filename

	  echo "Filter and copy CSV files:"

	  # NUL-delimited so that CSV paths containing spaces, backslashes or newlines
	  # are read back exactly as `find` produced them.
	  while IFS= read -r -d '' csv_filepath; do
	    image_filename_col_idx=$(csv_get_filename_column_index "${csv_filepath}")
	    csv_filename=$(basename "${csv_filepath}")
	    target_csv_filename=$(realpath "${bin_target_path}/${csv_filename}")

	    echo " - Creating filtered copy of '${csv_filepath}' as '${target_csv_filename}'..."

	    # Always copy header line
	    head -n 1 "${csv_filepath}" > "${target_csv_filename}"

	    # Filter out CSV lines that reference files not in this solution bin.
	    # `IFS= read -r` keeps backslashes and surrounding whitespace intact; the
	    # `|| [[ -n ... ]]` clause keeps a final row that has no trailing newline.
	    while IFS= read -r csv_line || [[ -n "${csv_line}" ]]; do
	      # A row whose filename cannot be extracted is treated as "not in this
	      # bin" rather than aborting the whole run.
	      csv_image_filename=$(
	        csv_line_to_psv "${csv_line}" |
	          cut -d '|' -f "${image_filename_col_idx}"
	      ) || csv_image_filename=''

	      # A quoted CSV field ("images/a, b.jpg") names the file without the
	      # surrounding quotes; doubled quotes inside it stand for one quote.
	      if [[ "${csv_image_filename}" == \"*\" ]]; then
	        csv_image_filename="${csv_image_filename:1:${#csv_image_filename}-2}"
	        csv_image_filename="${csv_image_filename//\"\"/\"}"
	      fi

	      if [[ -f "${bin_target_path}/${csv_image_filename}" ]]; then
	        printf '%s\n' "${csv_line}" >> "${target_csv_filename}"
	      fi
	    done < <(tail -n +2 "${csv_filepath}")
	  done < <(find "${src_path}" -maxdepth 1 -name '*.csv' -print0)

	  echo ""
	}

	##
	# Convert a bin (folder of files) into a ZIP file.
	#
	# The ZIP file is written to "${output_path}/<prefix>-<sequence>.zip". The
	# files are moved into the archive (zip's "-m" option), after which the
	# emptied bin folder is removed.
	#
	# Input global vars:
	#   - $output_path: The folder in which the ZIP file is created.
	#
	# @param string $1
	#   The target path containing the files to put into the ZIP archive.
	# @param string $2
	#   The ZIP file prefix.
	# @param string $3
	#   The three-digit bin sequence number.
	#
	zip_solution_files() {
	  local bin_target_path="${1}"
	  local zip_prefix="${2}"
	  local bin_sequence_number="${3}"
	  local target_zip_filename

	  # Resolved to an absolute path up front because zip runs from inside the
	  # bin folder.
	  target_zip_filename=$(
	    realpath "${output_path}/${zip_prefix}-${bin_sequence_number}.zip"
	  )

	  echo "Zipping bin as '${target_zip_filename}'..."

	  # Run zip from inside the bin so archive paths are relative to it. The
	  # sub-shell confines the directory change, so the caller's working
	  # directory is untouched even on failure and no `cd -` output is printed.
	  (
	    cd "${bin_target_path}" || exit
	    zip -9 -rm "${target_zip_filename}" ./
	  ) || return

	  rmdir "${bin_target_path}"

	  echo ""
	}

	################################################################################
	# Utility Functions
	################################################################################

	##
	# Create a temporary directory.
	#
	# This function should be invoked in a sub-shell so that the directory path can
	# be echo-ed into a local variable. For example:
	#   local tmp_dir=$(create_tmp_dir)
	#
	# If the directory should be removed when this script exits, be sure to call
	# delete_on_exit and pass it the directory name. For example:
	#   local tmp_dir=$(create_tmp_dir)
	#   delete_on_exit "${tmp_dir}"
	#
	# @return string
	#   The path to the temporary folder. If the folder cannot be created, nothing
	#   is printed and the exit status of `mktemp` is returned.
	#
	create_tmp_dir() {
	  # mktemp prints the new path itself. Running it as the last command (rather
	  # than through `local tmpdir=$(...)`) lets its failure reach the caller
	  # instead of being replaced by the always-successful status of `local`.
	  mktemp -d '/tmp/split_zip.XXXXXXXXXX'
	}

	##
	# Delete a file or folder when this script exits normally or abnormally.
	#
	# The deletion is queued as a command string via add_on_exit. An empty path is
	# ignored.
	#
	# @param string $1
	#   The path to the file/folder to delete.
	#
	delete_on_exit() {
	  local target="${1}"
	  local quoted_target

	  if [[ -z "${target}" ]]; then
	    return 0
	  fi

	  # The queued string is later run through `eval`, so the path has to be
	  # shell-quoted here; otherwise spaces or metacharacters in it would make
	  # `rm -rf` act on different paths than the one requested.
	  printf -v quoted_target '%q' "${target}"

	  add_on_exit "rm -rf -- ${quoted_target}"
	}

	# Credit:
	# https://www.linuxjournal.com/content/use-bash-trap-statement-cleanup-temporary-files
	#
	# Commands queued to run when the script exits, in the order they were added.
	declare -a on_exit_items=()

	##
	# Queue-up a command to run when this script exits normally or abnormally.
	#
	# The arguments are joined into a single command string that is evaluated by
	# dispatch_on_exit_items. The first call installs the traps: the queue runs on
	# EXIT, and HUP/INT/TERM terminate the script with the conventional
	# 128 + signal number status, which in turn fires the EXIT trap exactly once.
	#
	# @param string $*
	#   The command and arguments to queue-up.
	#
	add_on_exit() {
	  # NOTE(review): `set` is not scoped to the function, so this turns off
	  # "nounset" for the rest of the script after the first call. It is kept
	  # because code elsewhere in the file may depend on it -- confirm before
	  # removing.
	  set +u

	  local n=${#on_exit_items[@]}

	  on_exit_items+=("$*")

	  # Setup traps on the first item added to the list
	  if [[ ${n} -eq 0 ]]; then
	    trap dispatch_on_exit_items EXIT

	    # A signal handler that merely returns lets the script carry on after the
	    # signal (with its temporary files already deleted) and then runs the
	    # clean-up a second time on EXIT. Exiting from the handler avoids both.
	    trap 'exit 129' HUP
	    trap 'exit 130' INT
	    trap 'exit 143' TERM
	  fi
	}

	##
	# Execute commands that were queued-up for when this script exits.
	#
	# Commands run in the order they were queued. Each one is attempted even if
	# an earlier one fails.
	#
	# Input global vars:
	#   - $on_exit_items: An array of command strings queued by add_on_exit.
	#
	dispatch_on_exit_items() {
	  # Expanding an empty array must not trip "nounset" while exiting.
	  set +u

	  local queued_command

	  for queued_command in "${on_exit_items[@]}"; do
	    # Quoted, so the command string reaches `eval` exactly as it was queued
	    # (no word-splitting or globbing beforehand). Clean-up is best-effort:
	    # `|| true` stops one failing command from aborting the rest under
	    # `set -e`.
	    eval "${queued_command}" || true
	  done
	}

	##
	# Determine which column in a CSV file contains filenames.
	#
	# Only the header (first) line is inspected. The result is the 1-based index
	# of the first column whose header text contains "Filename"; nothing is
	# printed when no column matches.
	#
	# @param string $1
	#   The path to the CSV file.
	#
	# @return string
	#   The 1-based column index, or no output if there is no such column.
	#
	csv_get_filename_column_index() {
	  local csv_filepath="${1}"

	  # With "|" as the record separator, awk sees each header cell as one record,
	  # so the record number (NR) is the column index.
	  csv_file_to_psv "${csv_filepath}" |
	    head -n 1 |
	    awk -v RS='|' '/Filename/ { print NR; exit }'
	}

	##
	# Convert a Comma-Separated Values (CSV) file to a Pipe-Separated Values (PSV)
	# file.
	#
	# Any interior commas (i.e. commas inside quoted field values) are ignored.
	# Quotes around field values are left in place.
	#
	# This is needed so that it's easier to work with data that contains embedded
	# commas and/or quotes. The pipe symbol does not appear often (if at all) in
	# data that Inveniem works with.
	#
	# @param string $1
	#   The path to the CSV file.
	#
	# @return string
	#   The converted contents of the file, written to standard output.
	#
	csv_file_to_psv() {
	  local csv_filepath="${1}"

	  # From:
	  # https://unix.stackexchange.com/a/450813
	  #
	  # The pattern matches a run of complete fields (unquoted characters or whole
	  # "quoted strings") followed by a comma, and turns that comma into a pipe.
	  # `t1` repeats the substitution until no field-separating comma is left.
	  sed -E -e ':1' -e 's/^(([^",]|"[^"]*")*),/\1|/;t1' -- "${csv_filepath}"
	}

	##
	# Convert a single line of a Comma-Separated Values (CSV) file into a
	# Pipe-Separated Values (PSV) line.
	#
	# Any interior commas (i.e. commas inside quoted field values) are ignored.
	# Quotes around field values are left in place.
	#
	# This is needed so that it's easier to work with data that contains embedded
	# commas and/or quotes. The pipe symbol does not appear often (if at all) in
	# data that Inveniem works with.
	#
	# @param string $1
	#   The CSV line to convert.
	#
	# @return string
	#   The converted line, written to standard output.
	#
	csv_line_to_psv() {
	  local csv_line="${1}"

	  # From:
	  # https://unix.stackexchange.com/a/450813
	  #
	  # `printf` is used instead of `echo` so that a line such as "-n" is passed
	  # through as data rather than being read as an option. See csv_file_to_psv
	  # for the same expression applied to a whole file.
	  printf '%s\n' "${csv_line}" |
	    sed -E -e ':1' -e 's/^(([^",]|"[^"]*")*),/\1|/;t1'
	}

	##
	# Reduce a ZIP file path to the bare archive name.
	#
	# Leading folders are dropped, as is a trailing ".zip" extension when present.
	#
	# @param string $1
	#   The path to the target ZIP file.
	#
	# @return string
	#   The file name without folders or ".zip" suffix, written to standard
	#   output.
	#
	zip_basename() {
	  local archive_path="${1}"
	  local -r archive_suffix='.zip'

	  basename "${archive_path}" "${archive_suffix}"
	}

	##
	# Pause with the message "Press ENTER to continue" before exiting the program.
	#
	# This makes it easier for users to invoke this script via a drag-and-drop
	# approach instead of having to run it via CLI to see error output, in the event
	# that what they gave the program has problems.
	#
	# The prompt is skipped when no terminal can be opened, so that the requested
	# exit code is still returned.
	#
	# @param integer $1 [optional]
	#   The exit code to return when the script exits. The default is 0 (success).
	#
	pause_and_exit() {
	  local exit_code="${1:-0}"

	  echo ""

	  # Probe /dev/tty first: if it cannot be opened, an unguarded `read` would
	  # fail and `set -e` would end the script with status 1 instead of
	  # ${exit_code}. `|| true` covers a read that fails after the probe (e.g.
	  # end-of-file).
	  if { : < /dev/tty; } 2>/dev/null; then
	    read -r -p "Press ENTER to continue." < /dev/tty || true
	  fi

	  exit "${exit_code}"
	}

	##
	# Shim for `realpath` on systems like OSX.
	#
	# Only defined when no `realpath` command is available. Leans on PHP's or
	# Perl's implementation instead, in that order of preference; if neither is
	# installed, an error is printed and the script exits with code 10.
	#
	# NOTE(review): PHP's realpath() yields false (printed as an empty string) for
	# a path that does not exist, while this script also resolves paths of files
	# it is about to create -- confirm the PHP branch on a system without a native
	# `realpath`.
	#
	# @param string $1
	#   The path to resolve.
	#
	# @return string
	#   The resolved path, written to standard output.
	#
	if ! command -v realpath >/dev/null 2>&1; then
	  realpath() {
	    if command -v php >/dev/null 2>&1; then
	      php -r 'echo realpath($argv[1]);' -- "${1}"
	    elif command -v perl >/dev/null 2>&1; then
	      perl -e 'use Cwd "abs_path";print abs_path(shift)' "${1}"
	    else
	      {
	        echo "'realpath' is not supported on this system, and there are no"
	        echo "alternatives (PHP, Perl) available."
	      } >&2

	      pause_and_exit 10
	    fi
	  }
	fi

	################################################################################
	# Main Script Body
	################################################################################

	# Exactly one or two arguments are accepted: the ZIP file to split and,
	# optionally, the maximum size of each piece.
	if [[ $# -lt 1 || $# -gt 2 ]]; then
	  print_usage
	else
	  zip_filename="${1}"

	  # Unpack the archive, prepare the output folder, and validate the CSV files
	  # before any splitting work is done.
	  unzip_source_file "${zip_filename}"
	  prepare_output_path "${zip_filename}"
	  sanity_check_csv_files

	  # Measure the files, then distribute them into bins and archive each bin.
	  read_source_file_sizes
	  organize_files "${zip_filename}"
	fi