Skip to content

Instantly share code, notes, and snippets.

@rkoopmann
Last active February 28, 2020 16:34
Show Gist options
  • Save rkoopmann/a050076f98ad9b31b6174ee65a8d80f3 to your computer and use it in GitHub Desktop.
Save rkoopmann/a050076f98ad9b31b6174ee65a8d80f3 to your computer and use it in GitHub Desktop.
a file-splitting script
#!/bin/bash
# split-file.sh
# a file-splitting script.
#
# take a delimited file, choose a column suitable for splitting by value, and split the file into
# sub-directories (named after the splitting column and the split value).
#
# limitations
# - header record cannot contain spaces
# - splitting column cannot contain more than 10 distinct values (case-insensitive)
# - values for splitting column cannot contain spaces
#
# usage (run in directory containing the file-to-be-split):
# bash /path/to/split-file.sh file-to-be-split splitting-column 'delimiter'
# positional arguments (all three are required):
#   $1 - delimited file to split
#   $2 - name of the header column whose values drive the split
#   $3 - field delimiter, handed to awk via -F
# fail fast with a usage message instead of letting awk run with a missing file or delimiter
if [ "$#" -lt 3 ]; then
  echo "usage: bash ${0} file-to-be-split splitting-column 'delimiter'" >&2
  exit 2
fi
FILE=$1
SPLIT=$2
DELIM=$3
if [ ! -r "${FILE}" ]; then
  echo "cannot read file: ${FILE}" >&2
  exit 2
fi
# run through the file once to collect the (upper-cased) values found in the splitting column.
# -F and the file name are quoted so that whitespace delimiters (e.g. a tab) and file names
# containing spaces reach awk intact. awk runs on its own, outside the sort pipeline, so that a
# failure (unreadable file, unknown column) stops the script instead of being masked by sort.
SPLIT_VALUES=$(awk -v COL="${SPLIT}" -F"${DELIM}" '
# set column names based on first record
NR == 1 {
  for (i = 1; i <= NF; i++) { f[$i] = i }
  # an unknown column would make f[COL] empty, i.e. $0 (the whole record) would be used
  if (!(COL in f)) {
    print "column not found in header: " COL | "cat 1>&2"
    exit 2
  }
}
# grab the splitting column
NR != 1 { print toupper($(f[COL])) }
' "${FILE}") || exit 1
# reduce to the distinct values, one per line
SPLIT_VALUES=$(printf '%s\n' "${SPLIT_VALUES}" | sort -u)
# quantify the split and prepare the list of values for later use
# number of distinct values (lines); awk strips the padding some wc implementations emit
SPLIT_VALUE_COUNT=$(printf '%s\n' "${SPLIT_VALUES}" | wc -l | awk '{print $1}')
# the same values joined by single spaces (keeps a trailing space)
SPLIT_VALUE_LIST=$(printf '%s\n' "${SPLIT_VALUES}" | tr '\n' ' ')
# number of space-separated words in the list; differs from the line count when a value
# itself contains a space
SPLIT_VALUE_LIST_COUNT=$(printf '%s\n' "${SPLIT_VALUE_LIST}" | tr ' ' '\n' | grep -c -v '^$')
# bail out if the splitting column has more than 10 distinct values.
# diagnostics go to stderr and the script exits non-zero so callers can detect the failure
# (a bare "exit" would report the status of the preceding echo, i.e. success).
if [ "${SPLIT_VALUE_COUNT}" -gt 10 ]; then
  echo "Whoa. ${SPLIT} has ${SPLIT_VALUE_COUNT} values and that is too rich for my blood." >&2
  # numbered list of the offending values, formatted as "n:value"
  echo "${SPLIT_VALUES}" | cat -n | tr $'\t' ':' >&2
  exit 1
fi
# bail out if the split column values contain spaces: the word count of the space-joined list
# no longer matches the line count of the value list in that case
if [ "${SPLIT_VALUE_COUNT}" != "${SPLIT_VALUE_LIST_COUNT}" ]; then
  echo "Egad. The list of values seems to contain spaces and that is too confusing for me." >&2
  # show the list with every space replaced by "!" to make the spaces visible
  echo "${SPLIT_VALUE_LIST// /!}" >&2
  exit 1
fi
# the actual splitting of the file: every data record is written to a scratch file in the
# current directory named after the upper-cased value of its splitting column.
# -F and the file name are quoted so whitespace delimiters and file names with spaces survive
# word splitting; the script stops if awk fails so nothing half-written is processed further.
awk -v COL="${SPLIT}" -F"${DELIM}" '
# set column names based on first record
NR == 1 { for (i = 1; i <= NF; i++) { f[$i] = i } }
# split into separate files (redirection target parenthesised to keep the parse unambiguous)
NR != 1 { print > (toupper($(f[COL]))) }
' "${FILE}" || exit 1
# add header, move, rename, and cleanup
# turn the space-separated list into an array; unlike an unquoted expansion in the for list,
# this splits on whitespace without subjecting the values to pathname expansion
read -r -a SPLIT_VALUE_ITEMS <<< "${SPLIT_VALUE_LIST}"
for SPLIT_VALUE in "${SPLIT_VALUE_ITEMS[@]}"; do
  # target layout: <splitting-column>/<value>/<original-file-name>
  mkdir -p -- "${SPLIT}/${SPLIT_VALUE}" || exit 1
  NEW_FILE="${SPLIT}/${SPLIT_VALUE}/${FILE}"
  # header line first, then the records from the scratch file named after the value;
  # the scratch file is removed only once both writes have succeeded
  head -n 1 -- "${FILE}" > "${NEW_FILE}" \
    && cat -- "${SPLIT_VALUE}" >> "${NEW_FILE}" \
    && rm -- "${SPLIT_VALUE}" \
    || exit 1
done
# provide checks: print the line count of the source file followed by the line counts of
# every file with the same name found one directory level below the splitting-column directory.
# expansions are quoted (the glob itself is left unquoted) so names containing spaces work.
echo "source file line count"
wc -l -- "${FILE}"
echo
echo "split file(s) line count"
wc -l -- "${SPLIT}"/*/"${FILE}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment