Last active
February 28, 2020 16:34
-
-
Save rkoopmann/a050076f98ad9b31b6174ee65a8d80f3 to your computer and use it in GitHub Desktop.
a file-splitting script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# split-file.sh
# a file-splitting script.
#
# take a delimited file, choose a column suitable for splitting by value, and
# split the file into sub-directories (named after the splitting column and the
# upper-cased split value). every output file starts with the header record:
#   <splitting-column>/<VALUE>/<file-name>
#
# limitations
# - splitting column must be named in the header (first) record
# - splitting column cannot contain more than 10 distinct values (case-insensitive)
# - values for splitting column cannot be empty or contain whitespace
#
# usage (run in the directory that should receive the output):
# bash /path/to/split-file.sh file-to-be-split splitting-column 'delimiter'
#
# exit status: 0 on success, 1 when the file cannot be split, 2 on bad usage.

# make "awk | sort" report an awk failure instead of hiding it behind sort
set -o pipefail

# upper bound on the number of distinct split values (one directory each)
readonly MAX_SPLIT_VALUES=10

# print the usage line on stderr
usage() {
  printf '%s\n' \
    "usage: bash /path/to/split-file.sh file-to-be-split splitting-column 'delimiter'" >&2
}

# succeed when the header record of file $1 (delimiter $3) names column $2.
# only the first record is read.
has_column() {
  awk -v COL="$2" -F "$3" '
    NR == 1 {
      for (i = 1; i <= NF; i++) { f[$i] = i }
      exit
    }
    END { exit !(COL in f) }
  ' "$1"
}

# split file $1 on the values of column $2 using delimiter $3.
# writes <column>/<VALUE>/<basename of file> under the current directory and
# prints line counts of the source and the split files on stdout.
# returns 0 on success, 1 when the input is missing or violates a limitation.
split_file() {
  local file=$1 column=$2 delim=$3
  # output files are named after the source file without its directory part
  local name=${file##*/}
  local values value count bad

  if [[ ! -f "${file}" || ! -r "${file}" ]]; then
    echo "Oops. ${file} is not a readable file." >&2
    return 1
  fi

  # without this guard awk would resolve the unknown column to $0 and use
  # whole records as split values
  if ! has_column "${file}" "${column}" "${delim}"; then
    echo "Hmm. ${column} is not a column in the header of ${file}." >&2
    return 1
  fi

  # run through the file to collect the distinct (upper-cased) split values
  values=$(awk -v COL="${column}" -F "${delim}" '
    # set column names based on first record
    NR == 1 {
      for (i = 1; i <= NF; i++) { f[$i] = i }
      next
    }
    # grab the splitting column
    { print toupper($(f[COL])) }
  ' "${file}" | sort -u) || return 1

  if [[ -z "${values}" ]]; then
    echo "Hmm. ${column} has no values to split on (no data records, or only empty values)." >&2
    return 1
  fi

  # quantify the split and look for values that cannot become directory names
  count=0
  bad=0
  while IFS= read -r value; do
    count=$((count + 1))
    case "${value}" in
      '' | *[[:space:]]*) bad=1 ;;
    esac
  done <<< "${values}"

  # bail out if there are too many distinct values
  if ((count > MAX_SPLIT_VALUES)); then
    {
      echo "Whoa. ${column} has ${count} values and that is too rich for my blood."
      printf '%s\n' "${values}" | cat -n | tr '\t' ':'
    } >&2
    return 1
  fi

  # bail out if the split column values are empty or contain whitespace
  if ((bad)); then
    {
      echo "Egad. The list of values seems to contain spaces and that is too confusing for me."
      while IFS= read -r value; do
        printf '[%s]\n' "${value}"
      done <<< "${values}"
    } >&2
    return 1
  fi

  # create one directory per value and seed each output file with the header
  while IFS= read -r value; do
    mkdir -p -- "${column}/${value}" || return 1
    head -n 1 -- "${file}" > "${column}/${value}/${name}" || return 1
  done <<< "${values}"

  # the actual splitting: append every data record straight to its output
  # file, so no intermediate files are created in the current directory
  awk -v COL="${column}" -v NAME="${name}" -F "${delim}" '
    # set column names based on first record
    NR == 1 {
      for (i = 1; i <= NF; i++) { f[$i] = i }
      next
    }
    # ">>" keeps the header that is already in the file
    {
      out = COL "/" toupper($(f[COL])) "/" NAME
      print >> out
    }
  ' "${file}" || return 1

  # provide checks
  echo "source file line count"
  wc -l -- "${file}"
  echo
  echo "split file(s) line count"
  wc -l -- "${column}"/*/"${name}"
}

# validate the command line and run the split.
# returns 2 on bad usage, otherwise the status of split_file.
main() {
  if [[ $# -ne 3 || -z "$1" || -z "$2" || -z "$3" ]]; then
    usage
    return 2
  fi
  split_file "$1" "$2" "$3"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment