Skip to content

Instantly share code, notes, and snippets.

@GuyPaddock
Last active March 18, 2019 15:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GuyPaddock/b9b27ad8764dce1b9b049a3a9558f736 to your computer and use it in GitHub Desktop.
Save GuyPaddock/b9b27ad8764dce1b9b049a3a9558f736 to your computer and use it in GitHub Desktop.
Split a ZIP archive for Drupal Feeds Fetcher Archive + CSV into separate archives < 100 MB each (needed for Pantheon)
#!/usr/bin/env bash
##
# @file
# Asset Zip Splitter
#
# Splits a ZIP archive containing the following file structure into separate
# archives that contain no more than 100 MB each:
# - *.csv (CSV files that reference image files)
# - images/ (a folder of images referenced by the CSV files)
#
# This allows the file to be uploaded to a Drupal installation running on
# Pantheon that is using Feeds Fetcher Archive to process the CSV file.
# Custom code in the Drupal installation is required to handle import of the
# images themselves (convert filenames to file references).
#
# Each CSV file MUST contain a column named "Filename" that contains the
# filename of each image asset referenced. When constructing each "bin" (i.e.
# smaller ZIP file), this script automatically includes a copy of each CSV
# file in each ZIP file, but rows that reference files that are not in the bin
# are automatically omitted so that the CSV files inside each archive only
# reference files actually in the bin.
#
# Usage:
# ```
# ./split_zip.sh <ZIP filename> [max file size]
# ```
#
# If "max file size" is specified (in bytes), then the script will use that
# limit when splitting archives. For example:
# ```
# ./split_zip.sh my.zip 200000000
# ```
# Would split an archive into ZIP files containing no more than 200 MB.
#
# The output from the script is saved next to the original archive, in a
# folder named after the archive with a "-split" suffix (e.g. "my-split/"),
# with each piece named relative to the original archive (e.g. "my-001.zip",
# "my-002.zip", etc).
#
# Copyright 2019 Inveniem. All rights reserved.
#
# @author Guy Elsmore-Paddock (guy@inveniem.com)
# @license GNU General Public License, version 3 or later
#
# Abort when an unset variable is expanded.
set -o nounset
# Abort when a command exits with a non-zero status.
#
# NOTE: `let` returns a non-zero status whenever its result is 0, so it does
# not combine well with this option.
set -o errexit
################################################################################
# Constants
################################################################################
##
# The maximum size, in bytes, of a bin of files.
#
# Defaults to 100 MB. Can be overridden by providing a second parameter to the
# script; the value must be a positive whole number of bytes written in plain
# decimal (no leading zeros, which the shell would treat as octal).
#
MAX_BIN_SIZE="${2:-100000000}"
# Validate up-front: an invalid size would otherwise only surface as a cryptic
# arithmetic error after the whole archive has been unpacked.
if [[ ! "${MAX_BIN_SIZE}" =~ ^[1-9][0-9]*$ ]]; then
  {
    echo "ERROR: max file size must be a positive whole number of bytes."
    echo "  - Value provided: ${MAX_BIN_SIZE}"
  } >&2
  exit 1
fi
readonly MAX_BIN_SIZE
##
# Path relative to the extract path that contains the images.
#
readonly RELATIVE_IMAGE_SOURCE_PATH="images"
##
# Suffix used to name the folder where the new ZIP files will be saved.
#
# The folder is created in the folder that contains the ZIP file and is named
# "<ZIP base name>-<this value>".
#
readonly RELATIVE_TARGET_PATH="split"
################################################################################
# Main Functions
################################################################################
##
# Print the command-line usage summary to standard error and exit with
# status 1.
#
print_usage() {
  printf '%s\n' \
    "Usage: ${0} <ZIP filename> [max file size]" \
    "" \
    "Split a ZIP file that contains a CSV file and images into multiple " \
    "zip files that each contain no more than 'max file size' bytes of " \
    "content each. If the file size is not specified, defaults to " \
    "100,000,000 (100 MB)" >&2
  exit 1
}
##
# Unzips the provided file to a temporary folder.
#
# The temporary folder is queued for deletion when the script exits.
#
# Output global vars:
# - $src_path: The path to the folder that contains the contents of the zip
#   file.
#
# @param string $1
#   The path to the ZIP file to extract.
#
# @return
#   Non-zero if the temporary folder could not be created or if extraction
#   failed; in both cases $src_path is left untouched.
#
unzip_source_file() {
  local zip_path="${1}"
  local extract_path

  # Declared separately from the assignment so that a failure to create the
  # temporary folder is not masked by the exit status of `local`.
  if ! extract_path=$(create_tmp_dir) || [[ -z "${extract_path}" ]]; then
    echo "ERROR: Could not create a temporary folder for extraction." >&2
    return 1
  fi

  delete_on_exit "${extract_path}"

  echo "Unpacking '${zip_path}'..."
  unzip "${zip_path}" -d "${extract_path}" || return
  echo "Done!"
  echo ""

  # Globals
  declare -g src_path="${extract_path}"
}
##
# Remove any existing output folder and then create it.
#
# The output folder is created in the folder that contains the input ZIP file
# and is named "<ZIP base name>-${RELATIVE_TARGET_PATH}".
#
# Input global vars:
# - $RELATIVE_TARGET_PATH: The suffix used to name the output folder.
#
# Output global vars:
# - $output_path: The absolute path to the folder that should contain output
#   files.
#
# @param string $1
#   The path to the ZIP file that the script is operating on.
#
# @return
#   Non-zero if the folder that contains the ZIP file cannot be resolved.
#
prepare_output_path() {
  local zip_filename="${1}"
  local zip_dir
  local zip_name
  local resolved_dir

  zip_dir=$(dirname "${zip_filename}")
  zip_name=$(zip_basename "${zip_filename}")

  # Resolve the (existing) parent folder rather than the output folder itself:
  # the output folder usually does not exist yet, and not every `realpath`
  # implementation can resolve a path that does not exist.
  resolved_dir=$(realpath "${zip_dir}") || resolved_dir=""
  if [[ -z "${resolved_dir}" ]]; then
    echo "ERROR: Could not resolve the folder that contains '${zip_filename}'." >&2
    return 1
  fi

  # Globals
  declare -g output_path="${resolved_dir%/}/${zip_name}-${RELATIVE_TARGET_PATH}"

  echo "Clearing and creating output path '${output_path}'..."
  # ":?" aborts rather than letting an empty path reach `rm -rf`.
  rm -rf -- "${output_path:?}"
  mkdir -p -- "${output_path}"
  echo "Done!"
  echo ""
}
##
# Ensure that all CSV files in the source path are properly formed and refer to
# only files that exist.
#
# This is a necessary step because the process of breaking up a large set of
# files and metadata into smaller sets makes it more difficult to detect when
# one or more assets are missing or malformed during import.
#
# Only CSV files directly inside the source path are checked (sub-folders are
# not searched).
#
# Input global vars:
# - $src_path: The absolute path to the folder that contains the contents of
#   the zip file.
#
sanity_check_csv_files() {
  local csv_filepath

  # NUL-delimited paths read with `IFS= read -r` keep backslashes and leading
  # or trailing whitespace in file names intact.
  while IFS= read -r -d '' csv_filepath; do
    sanity_check_csv_file "${csv_filepath}"
  done < <(find "${src_path}" -maxdepth 1 -name '*.csv' -print0)
}
##
# Run every sanity check against a single CSV file.
#
# Verifies that the file has the expected column and that every file it refers
# to exists, printing progress messages around the checks.
#
# @param string $1
#   The path to the CSV file to sanity check.
#
sanity_check_csv_file() {
  local target_csv="${1}"

  printf '%s\n' "Sanity-checking CSV files..."
  sanity_check_csv_columns "${target_csv}"
  sanity_check_csv_filenames "${target_csv}"
  printf '%s\n\n' "Done!"
}
##
# Verify that a CSV file has a "Filename" column.
#
# Other parts of this script look rows up by this column, so the script is
# stopped (exit status 2) when it is absent.
#
# @param string $1
#   The path to the CSV file to sanity check.
#
sanity_check_csv_columns() {
  local csv_filepath="${1}"
  local filename_column=$(csv_get_filename_column_index "${csv_filepath}")

  if [[ -n "${filename_column}" ]]; then
    echo " - 'Filename' column is present. [success]"
  else
    # No column index was found, so the column is missing.
    printf '%s\n' \
      "" \
      "ERROR: CSV file must contain a 'Filename' column: ${csv_filepath}" >&2
    pause_and_exit 2
  fi
}
##
# Ensure that every asset referenced in the provided CSV file exists.
#
# This is a necessary step because the process of breaking up a large set of
# files and metadata into smaller sets makes it more difficult to detect when
# one or more assets are missing during import.
#
# Filenames are compared exactly as they appear in the "Filename" column
# (whitespace and backslashes are not altered), which is also how rows are
# matched when each bin's CSV file is filtered.
#
# The script is stopped (exit status 2) on the first missing file.
#
# Input global vars:
# - $src_path: The absolute path to the folder that contains the contents of
#   the zip file.
#
# @param string $1
#   The path to the CSV file to sanity check.
#
sanity_check_csv_filenames() {
  local csv_filepath="${1}"
  local image_filename_col_idx
  local csv_image_filename
  # Row 1 is the header, so the first data row is row 2.
  local current_row_number=2

  image_filename_col_idx=$(csv_get_filename_column_index "${csv_filepath}")

  # Ensure that all image files exist.
  #
  # The loop reads from a process substitution instead of sitting at the end
  # of a pipeline so that it runs in the current shell and the exit below is
  # not confined to a pipeline sub-shell.
  while IFS= read -r csv_image_filename \
      || [[ -n "${csv_image_filename}" ]]; do
    if [[ ! -f "${src_path}/${csv_image_filename}" ]]; then
      {
        echo ""
        echo "ERROR: Missing image file."
        echo " - CSV file: ${csv_filepath}"
        echo " - Image file: ${csv_image_filename}"
        echo " - Row: ${current_row_number}"
      } >&2
      pause_and_exit 2
    fi
    current_row_number=$(( current_row_number + 1 ))
  done < <(
    csv_file_to_psv "${csv_filepath}" \
      | tail -n +2 \
      | cut -d '|' -f "${image_filename_col_idx}"
  )

  echo " - All referenced files exist. [success]"
}
##
# Determine the file sizes of all files in the image folder of the source path.
#
# Input global vars:
# - $src_path: The absolute path to the folder that contains the contents of
#   the zip file.
# - $RELATIVE_IMAGE_SOURCE_PATH: The folder, relative to $src_path, that
#   contains the images.
#
# Output global vars:
# - $image_filenames: An array of filenames, ordered largest to smallest file.
# - $image_sizes: An associative array of filenames => sizes.
#
# Both arrays are reset on every call and are left empty when no files are
# found.
#
read_source_file_sizes() {
  # Globals
  declare -ag image_filenames=()
  declare -Ag image_sizes=()

  local image_size_output
  local filename
  local size

  echo "Determining sizes of files..."

  # Emit "<SIZE>|<FILENAME>" with the size first so that a "|" inside a
  # filename cannot be confused with the separator, then sort numerically on
  # the size (largest first), falling back to the name for a stable order.
  image_size_output=$(
    find "${src_path}/${RELATIVE_IMAGE_SOURCE_PATH}" \
      -type f \
      -exec stat -c '%s|%n' '{}' '+' \
      | LC_ALL=C sort -t '|' -k 1,1nr -k 2
  )

  # Convert "<SIZE>|<FILENAME>" to:
  # - An array of filenames, ordered largest to smallest file ($image_filenames)
  # - An associative array of filenames => sizes ($image_sizes)
  while IFS='|' read -r size filename; do
    # A here-string always yields at least one (possibly empty) line, and an
    # empty string is not a valid associative array key.
    if [[ -z "${filename}" ]]; then
      continue
    fi
    image_filenames+=("${filename}")
    image_sizes["${filename}"]="${size}"
  done <<< "${image_size_output}"

  echo "Done!"
  echo ""
}
##
# Copy files from the source path into "bins" of files that are each no larger
# than the maximum size, and zip up each bin.
#
# Bins are numbered sequentially, starting at 1. Once every file has been
# placed, the location of the output is printed and the script exits.
#
# Input global vars:
# - $src_path: The absolute path to the folder that contains the contents of
#   the zip file.
# - $output_path: The absolute path to the folder that should contain output
#   files.
# - $image_filenames: An array of filenames, ordered largest to smallest file.
# - $image_sizes: An associative array of filenames => sizes.
#
# @param string $1
#   The path to the ZIP file that the script is operating on.
#
organize_files() {
  local zip_filename="${1}"
  local archive_name
  # Kept local so that the counter does not leak into the global scope.
  local sequence_number=1

  archive_name=$(zip_basename "${zip_filename}")

  # Follow a greedy algorithm, taking the largest files that will fit in each
  # batch. Each pass removes the chosen files from $image_sizes, so the loop
  # ends once every file has been assigned to a bin.
  while [[ "${#image_sizes[@]}" -gt 0 ]]; do
    calculate_solution
    copy_and_zip_solution_files "${archive_name}" "${sequence_number}"
    sequence_number=$(( sequence_number + 1 ))
  done

  echo "Done!"
  echo ""
  echo "Output can be found in: ${output_path}"
  echo ""
  pause_and_exit
}
##
# Locate files that are small enough to fit in the current bin.
#
# The total size of the bin is guaranteed to be no larger than the maximum
# total size, but may be smaller if the remaining files do not fit or there
# are no files left.
#
# Every file that is selected is removed from $image_sizes, so that array
# always holds the files that have not been assigned to a bin yet.
#
# If files remain but none of them fits in an empty bin, the script is stopped
# (exit status 3).
#
# Input global vars:
# - $image_filenames: An array of filenames, ordered largest to smallest file.
# - $image_sizes: An associative array of filenames => sizes.
# - $MAX_BIN_SIZE: The maximum size, in bytes, of a bin.
#
# Output global vars:
# - $current_solution: An array containing filenames of all files that will fit
#   in the current bin.
#
calculate_solution() {
  # Globals
  declare -ag current_solution
  current_solution=()

  local current_solution_size=0
  local filename
  local file_size
  local new_solution_size

  # NOTE: We can't loop over the hash keys because they are sorted by hash code,
  # not size.
  for filename in "${image_filenames[@]}"; do
    # Skip files that were already placed in an earlier bin.
    if [[ -z "${image_sizes[$filename]+_}" ]]; then
      continue
    fi

    file_size="${image_sizes[$filename]}"
    new_solution_size=$(( current_solution_size + file_size ))

    if [[ "${new_solution_size}" -le "${MAX_BIN_SIZE}" ]]; then
      current_solution+=("${filename}")
      current_solution_size="${new_solution_size}"
      # Single quotes keep the subscript from being globbed or word-split
      # before `unset` sees it.
      unset 'image_sizes[$filename]'
    fi
  done

  # Check if we have a solution when we expected one: files are still waiting
  # to be placed, yet not even one of them fit into an empty bin.
  if [[ "${#image_sizes[@]}" -gt 0 && \
        "${#current_solution[@]}" -eq 0 ]]; then
    {
      echo "ERROR: Cannot create small enough bin -- remaining files may be "
      echo "too large."
    } >&2
    pause_and_exit 3
  fi
}
##
# Build the bin for the current solution: create the bin folder, fill it with
# the selected images and the filtered CSV files, and turn it into a ZIP file.
#
# The bin folder is named after the sequence number, zero-padded to three
# digits.
#
# Input global vars:
# - $output_path: The folder in which the bin folder is created.
# - $current_solution: An array containing filenames of all files that will fit
#   in the current bin.
#
# @param string $1
#   The prefix to use for each ZIP file.
# @param integer $2
#   The current sequence number.
#
copy_and_zip_solution_files() {
  local archive_prefix="${1}"
  local bin_number="${2}"
  local padded_number=$(printf '%03d' "${bin_number}")
  local bin_dir="${output_path}/${padded_number}"

  mkdir -p "${bin_dir}"
  echo "=== Bin #${padded_number} === "

  copy_solution_images "${bin_dir}"
  copy_solution_csv_files "${bin_dir}"
  zip_solution_files "${bin_dir}" "${archive_prefix}" "${padded_number}"
}
##
# Copy all of the image files/assets for the current solution to the folder for
# the corresponding bin of files.
#
# The location of each file relative to the image folder of the source path is
# preserved inside the bin, so images stored in sub-folders keep the path that
# the CSV files use to refer to them and files with the same name in different
# sub-folders do not overwrite each other. A file that is not located under
# the image folder of the source path is copied directly into the bin's image
# folder.
#
# Input global vars:
# - $src_path: The absolute path to the folder that contains the contents of
#   the zip file.
# - $RELATIVE_IMAGE_SOURCE_PATH: The folder, relative to $src_path and to the
#   bin, that contains the images.
# - $current_solution: An array containing filenames of all files that will fit
#   in the current bin.
#
# @param string $1
#   The target path where files are stored for the current solution bin.
#
copy_solution_images() {
  local bin_target_path="${1}"
  local image_root="${src_path:-}/${RELATIVE_IMAGE_SOURCE_PATH}"
  local target_root="${bin_target_path}/${RELATIVE_IMAGE_SOURCE_PATH}"
  local filename
  local relative_path
  local target_file

  echo "Select and copy images:"
  for filename in "${current_solution[@]}"; do
    if [[ -n "${src_path:-}" && "${filename}" == "${image_root}/"* ]]; then
      # Keep the sub-folder structure below the image folder.
      relative_path="${filename#"${image_root}/"}"
    else
      relative_path="${filename##*/}"
    fi

    target_file="${target_root}/${relative_path}"
    mkdir -p -- "${target_file%/*}"
    cp -v -- "${filename}" "${target_file}"
  done
  echo ""
}
##
# Copy a filtered version of each CSV file to the folder for the corresponding
# bin of files.
#
# The header line is always copied. Rows that refer to files that are not in
# the specified bin are filtered out; rows that are kept are copied verbatim.
#
# This must run after the images of the bin have been copied, because a row is
# kept only if the file named in its "Filename" column exists inside the bin.
#
# Input global vars:
# - $src_path: The absolute path to the folder that contains the contents of
#   the zip file.
#
# @param string $1
#   The target path where files are stored for the current solution bin.
#
copy_solution_csv_files() {
  local bin_target_path="${1}"
  local csv_filepath
  local csv_filename
  local target_csv_filename
  local image_filename_col_idx
  local csv_line
  local csv_image_filename

  echo "Filter and copy CSV files:"
  while IFS= read -r -d '' csv_filepath; do
    image_filename_col_idx=$(csv_get_filename_column_index "${csv_filepath}")
    csv_filename="${csv_filepath##*/}"
    target_csv_filename="${bin_target_path}/${csv_filename}"
    echo " - Creating filtered copy of '${csv_filepath}' as '${target_csv_filename}'..."

    # Always copy header line
    head -n 1 "${csv_filepath}" > "${target_csv_filename}"

    # Filter out CSV lines that reference files not in this solution bin.
    #
    # `IFS= read -r` keeps whitespace and backslashes of each row intact, and
    # the `|| [[ -n ... ]]` clause keeps a last row that has no trailing
    # newline.
    while IFS= read -r csv_line || [[ -n "${csv_line}" ]]; do
      # A row whose filename cannot be extracted is treated like a row that
      # refers to a file outside of this bin.
      csv_image_filename=$(
        csv_line_to_psv "${csv_line}" \
          | cut -d '|' -f "${image_filename_col_idx}"
      ) || csv_image_filename=""

      if [[ -f "${bin_target_path}/${csv_image_filename}" ]]; then
        printf '%s\n' "${csv_line}" >> "${target_csv_filename}"
      fi
    done < <(tail -n +2 "${csv_filepath}")
  done < <(find "${src_path}" -maxdepth 1 -name '*.csv' -print0)
  echo ""
}
##
# Convert a bin (folder of files) into a ZIP file.
#
# The ZIP file is created in the output path and is named
# "<prefix>-<sequence number>.zip". Files are moved into the archive, and the
# emptied bin folder is removed afterwards.
#
# Input global vars:
# - $output_path: The folder in which the ZIP file is created.
#
# @param string $1
#   The target path containing the files to put into the ZIP archive.
# @param string $2
#   The ZIP file prefix.
# @param string $3
#   The three-digit bin sequence number.
#
# @return
#   Non-zero if the archive could not be created or the bin folder could not
#   be removed; the bin folder is left in place when archiving fails.
#
zip_solution_files() {
  local bin_target_path="${1}"
  local zip_prefix="${2}"
  local bin_sequence_number="${3}"
  local target_zip_filename="${output_path}/${zip_prefix}-${bin_sequence_number}.zip"

  # The archive is written while inside the bin folder, so its path must not
  # depend on the current folder.
  if [[ "${target_zip_filename}" != /* ]]; then
    target_zip_filename="${PWD}/${target_zip_filename}"
  fi

  echo "Zipping bin as '${target_zip_filename}'..."

  # Run from inside the bin folder so that paths in the archive are relative
  # to it. The sub-shell keeps the folder change from affecting the caller,
  # even when archiving fails.
  (
    cd -- "${bin_target_path}" \
      && zip -9 -rm "${target_zip_filename}" ./
  ) || return

  rmdir -- "${bin_target_path}" || return
  echo ""
}
################################################################################
# Utility Functions
################################################################################
##
# Create a temporary directory.
#
# This function should be invoked in a sub-shell so that the directory path can
# be echo-ed into a local variable. Declare the variable separately from the
# assignment so that a failure is not masked by `local`. For example:
#   local tmp_dir
#   tmp_dir=$(create_tmp_dir)
#
# If the directory should be removed when this script exits, be sure to call
# delete_on_exit and pass it the directory name. For example:
#   local tmp_dir
#   tmp_dir=$(create_tmp_dir)
#   delete_on_exit "${tmp_dir}"
#
# @return string
#   The path to the temporary folder. Nothing is printed, and the exit status
#   is non-zero, when the folder could not be created.
#
create_tmp_dir() {
  # `mktemp` prints the path of the folder it created; calling it directly
  # lets its exit status become the status of this function.
  mktemp -d '/tmp/split_zip.XXXXXXXXXX'
}
##
# Delete a file or folder when this script exits normally or abnormally.
#
# @param string $1
#   The path to the file/folder to delete.
#
delete_on_exit() {
  local target="${1}"
  local quoted_target

  # The queued command is evaluated as shell code when the script exits, so the
  # path is shell-quoted to keep spaces and special characters from being
  # interpreted.
  printf -v quoted_target '%q' "${target}"
  add_on_exit "rm -rf -- ${quoted_target}"
}
# Credit:
# https://www.linuxjournal.com/content/use-bash-trap-statement-cleanup-temporary-files
#
# Commands to run when this script exits, in the order they were queued.
# Initialized to an empty list so that it can be inspected while the shell is
# set to stop on undefined variables.
declare -a on_exit_items=()
##
# Queue-up a command to run when this script exits normally or abnormally.
#
# The first call installs the traps that run the queued commands.
#
# @param string $*
#   The command and arguments to queue-up. The arguments are joined into one
#   string that is later evaluated as shell code, so they must already be
#   quoted for the shell.
#
add_on_exit() {
  on_exit_items+=("$*")

  # Setup traps on the first item added to the list
  if [[ "${#on_exit_items[@]}" -eq 1 ]]; then
    trap dispatch_on_exit_items EXIT
    # Turn the signals into a regular exit so that the EXIT trap runs the
    # queued commands exactly once and the script does not carry on after its
    # resources were cleaned up. Statuses follow the 128 + signal number
    # convention.
    trap 'exit 129' HUP
    trap 'exit 130' INT
    trap 'exit 143' TERM
  fi
}
##
# Execute commands that were queued-up for when this script exits.
#
# Commands run in the order in which they were queued. Every command is
# attempted even if an earlier one fails, and the queue is emptied afterwards
# so that a second invocation does not run the same commands again.
#
dispatch_on_exit_items() {
  local item

  # The "+" expansion keeps an empty queue from being reported as an undefined
  # variable by shells that treat empty arrays that way.
  for item in ${on_exit_items[@]+"${on_exit_items[@]}"}; do
    # Quoted so that the command is evaluated exactly as it was queued.
    # Cleanup is best-effort: a failing command must not prevent the remaining
    # ones from running.
    eval "${item}" || true
  done

  on_exit_items=()
}
##
# Find the position of the filename column in a CSV file.
#
# Only the header (first) line is inspected. The 1-based position of the first
# column whose heading contains "Filename" is printed; nothing is printed when
# there is no such column.
#
# @param string $1
#   The path to the CSV file.
#
csv_get_filename_column_index() {
  local source_csv="${1}"

  csv_file_to_psv "${source_csv}" | awk -F '|' '
    NR == 1 {
      for (column = 1; column <= NF; column++) {
        if ($column ~ /Filename/) {
          print column
          exit
        }
      }
      exit
    }
  '
}
##
# Print a Comma-Separated Values (CSV) file as Pipe-Separated Values (PSV).
#
# Commas that separate fields are replaced with pipes; commas inside
# double-quoted field values are left alone. Quotes are kept as-is.
#
# Working with pipes makes it easier to handle data that contains embedded
# commas and/or quotes. The pipe symbol does not appear often (if at all) in
# data that Inveniem works with.
#
# @param string $1
#   The path to the CSV file.
#
csv_file_to_psv() {
  local source_csv="${1}"
  # From:
  # https://unix.stackexchange.com/a/450813
  #
  # Replaces the first field-separating comma of a line, then jumps back to
  # the label to repeat until no such comma is left.
  local comma_to_pipe='s/^(([^",]|"[^"]*")*),/\1|/;t1'

  sed -E -e ':1' -e "${comma_to_pipe}" "${source_csv}"
}
##
# Convert a single line of a Comma-Separated Values (CSV) file into a line of
# Pipe-Separated Values (PSV).
#
# Any interior commas (i.e. commas inside quoted field values) are ignored.
#
# This is needed so that it's easier to work with data that contains embedded
# commas and/or quotes. The pipe symbol does not appear often (if at all) in
# data that Inveniem works with.
#
# @param string $1
#   The CSV line to convert (the text of the line, not a path to a file).
#
csv_line_to_psv() {
  local csv_line="${1}"
  # From:
  # https://unix.stackexchange.com/a/450813
  #
  # `printf` is used instead of `echo` so that a line that looks like an
  # option of `echo` (e.g. "-n") is passed through unchanged.
  printf '%s\n' "${csv_line}" | sed -Ee :1 -e 's/^(([^",]|"[^"]*")*),/\1|/;t1'
}
##
# Convert a ZIP file path into just the name of the file without file extension.
#
# The ".zip" extension is matched regardless of letter case (e.g. "My.ZIP"
# becomes "My"). Names with another extension are returned unchanged, as is a
# name that consists of nothing but the extension.
#
# @param string $1
#   The path to the target ZIP file.
#
zip_basename() {
  local zip_filename="${1}"
  local name

  # "--" keeps a name that starts with a dash from being read as an option.
  name=$(basename -- "${zip_filename}") || return

  case "${name}" in
    ?*.[zZ][iI][pP])
      # Strip the four-character extension.
      name="${name%.???}"
      ;;
  esac

  printf '%s\n' "${name}"
}
##
# Pause with the message "Press ENTER to continue" before exiting the program.
#
# This makes it easier for users to invoke this script via a drag-and-drop
# approach instead of having to run it via CLI to see error output, in the event
# that what they gave the program has problems.
#
# The pause is skipped when no terminal is available (e.g. when the script is
# run from an automated job), so that the requested exit code is still
# returned.
#
# @param integer $1 [optional]
#   The exit code to return when the script exits. The default is 0 (success).
#
pause_and_exit() {
  local exit_code="${1:-0}"

  echo ""

  # The prompt reads from the terminal rather than from standard input because
  # standard input may be redirected. Probe the terminal first: a failed
  # redirection would otherwise abort the script with the wrong exit code when
  # it is set to stop on non-zero exit.
  if { : < /dev/tty; } 2>/dev/null; then
    read -r -p "Press ENTER to continue." < /dev/tty || true
  fi

  exit "${exit_code}"
}
##
# Shim for `realpath` on systems (like OSX) that do not provide it.
#
# Implemented with shell built-ins only, so no other interpreter is required.
# Like the `realpath` command, it accepts a path whose last component does not
# exist yet, which this script relies on when it names files and folders it is
# about to create.
#
# Symbolic links in the folder part of the path are resolved; a symbolic link
# in the last component of a path to a file is not.
#
# @param string $1
#   The path to resolve.
#
# @return string
#   The absolute path. Nothing is printed, and the exit status is non-zero,
#   when the folder that contains the path does not exist.
#
_realpath_fallback() {
  local target="${1}"
  local parent_dir

  if [[ -d "${target}" ]]; then
    # CDPATH is cleared so that `cd` neither searches other folders nor prints
    # the folder it changed to.
    (CDPATH= cd -- "${target}" && pwd -P)
    return
  fi

  parent_dir=$(CDPATH= cd -- "$(dirname -- "${target}")" && pwd -P) || return
  # Stripping a trailing slash keeps paths directly below "/" from starting
  # with two slashes.
  printf '%s/%s\n' "${parent_dir%/}" "$(basename -- "${target}")"
}
command -v realpath >/dev/null 2>&1 || realpath() {
  _realpath_fallback "$@"
}
################################################################################
# Main Script Body
################################################################################
# The script takes the ZIP file and, optionally, the maximum bin size; any
# other number of arguments shows the usage summary instead.
case "$#" in
  1 | 2)
    zip_filename="${1}"

    # Unpack, validate, measure, and then split -- in that order, because each
    # step relies on the global variables set by the steps before it.
    unzip_source_file "${zip_filename}"
    prepare_output_path "${zip_filename}"
    sanity_check_csv_files
    read_source_file_sizes
    organize_files "${zip_filename}"
    ;;
  *)
    print_usage
    ;;
esac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment