rkoopmann/split-file.sh

## split-file.sh
#!/bin/bash

# split-file.sh
#   a file-splitting script.
#
# take a delimited file, choose a column suitable for splitting by value, split the file into sub-
# directories (named after splitting column and split value). every output file is written to
# <splitting-column>/<VALUE>/<file name> below the current directory and starts with a copy of
# the header record.
#
# limitations
#   - splitting column cannot contain more than 10 distinct values (case-insensitive)
#   - values for splitting column cannot be empty and cannot contain whitespace or slashes
#
# usage (run in directory containing the file-to-be-split):
#   bash /path/to/split-file.sh file-to-be-split splitting-column 'delimiter'
#
# exit status: 0 on success, 1 when the file cannot be split, 2 on a usage error.

# print a message on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# print the 1-based position of column $2 in the header (first) record of file $1, using $3 as
# the field separator. when the name occurs more than once the last occurrence wins. prints
# nothing and returns non-zero when the file has no records or the header lacks the column.
column_index() {
  awk -v COL="$2" -F "$3" '
    # set column names based on first record, then skip straight to END
    NR == 1 {
      for (i = 1; i <= NF; i++) { f[$i] = i }
      exit
    }
    END {
      if (!(COL in f)) { exit 1 }
      print f[COL]
    }
  ' "$1"
}

# split file $1 on the values of column $2, using $3 as the delimiter.
# writes the line-count report to stdout and all diagnostics to stderr.
# returns 0 on success, 1 when the file cannot be split, 2 on a usage error.
main() {
  local file=$1 col=$2 delim=$3
  local max_values=10
  local src base idx value dir out
  local -a values=() outputs=()

  if [[ $# -lt 3 || -z ${file} || -z ${col} || -z ${delim} ]]; then
    warn "usage: bash ${0##*/} file-to-be-split splitting-column 'delimiter'"
    return 2
  fi
  if [[ ! -f ${file} || ! -r ${file} ]]; then
    warn "Cannot read ${file}."
    return 1
  fi

  # awk treats operands like -x or name=value as options/assignments; an explicit ./ avoids that
  src=${file}
  [[ ${src} == /* ]] || src=./${src}
  base=${file##*/}

  # without this check an unknown column would make awk split on the whole record
  if ! idx=$(column_index "${src}" "${col}" "${delim}"); then
    warn "Hmm. ${col} is not a column in the header record of ${file}."
    return 1
  fi

  # run through the file to determine the distinct (upper-cased) split values
  while IFS= read -r value; do
    values+=("${value}")
  done < <(
    awk -v idx="${idx}" -F "${delim}" 'NR != 1 { print toupper($idx) }' "${src}" | sort -u
  )

  if [[ ${#values[@]} -eq 0 ]]; then
    warn "Nothing to split. ${file} has no records after the header."
    return 1
  fi

  # bail out if there are more than 10 split values
  if [[ ${#values[@]} -gt ${max_values} ]]; then
    warn "Whoa. ${col} has ${#values[@]} values and that is too rich for my blood."
    printf '%s\n' "${values[@]}" | cat -n | tr '\t' ':' >&2
    return 1
  fi

  # bail out if a split value cannot serve as a directory name
  for value in "${values[@]}"; do
    case ${value} in
      *[[:space:]]*)
        warn "Egad. The list of values seems to contain spaces and that is too confusing for me."
        warn "${value//[[:space:]]/!}"
        return 1
        ;;
      '' | . | .. | */*)
        warn "Egad. The value '${value}' cannot be used as a directory name."
        return 1
        ;;
    esac
  done

  # create one directory per value and start each output file with the header record
  for value in "${values[@]}"; do
    dir=${col}/${value}
    out=${dir}/${base}
    if ! mkdir -p -- "${dir}" || ! head -n 1 "${src}" > "${out}"; then
      warn "Cannot write ${out}."
      return 1
    fi
    outputs+=("${out}")
  done

  # the actual splitting of the file: append every data record to the file of its value
  if ! awk -v idx="${idx}" -v dir="${col}" -v base="${base}" -F "${delim}" '
    NR != 1 { out = dir "/" toupper($idx) "/" base; print >> out }
  ' "${src}"; then
    warn "Cannot split ${file}."
    return 1
  fi

  # provide checks
  echo "source file line count"
  wc -l -- "${file}"
  echo
  echo "split file(s) line count"
  wc -l -- "${outputs[@]}"
}

# last command of the script, so its status becomes the exit status
main "$@"
	#!/bin/bash

	# split-file.sh
	#   a file-splitting script.
	#
	# take a delimited file, choose a column suitable for splitting by value, split the file into sub-
	# directories (named after splitting column and split value). every output file is written to
	# <splitting-column>/<VALUE>/<file name> below the current directory and starts with a copy of
	# the header record.
	#
	# limitations
	#   - splitting column cannot contain more than 10 distinct values (case-insensitive)
	#   - values for splitting column cannot be empty and cannot contain whitespace or slashes
	#
	# usage (run in directory containing the file-to-be-split):
	#   bash /path/to/split-file.sh file-to-be-split splitting-column 'delimiter'
	#
	# exit status: 0 on success, 1 when the file cannot be split, 2 on a usage error.

	# print a message on stderr.
	warn() {
	  printf '%s\n' "$*" >&2
	}

	# print the 1-based position of column $2 in the header (first) record of file $1, using $3 as
	# the field separator. when the name occurs more than once the last occurrence wins. prints
	# nothing and returns non-zero when the file has no records or the header lacks the column.
	column_index() {
	  awk -v COL="$2" -F "$3" '
	    # set column names based on first record, then skip straight to END
	    NR == 1 {
	      for (i = 1; i <= NF; i++) { f[$i] = i }
	      exit
	    }
	    END {
	      if (!(COL in f)) { exit 1 }
	      print f[COL]
	    }
	  ' "$1"
	}

	# split file $1 on the values of column $2, using $3 as the delimiter.
	# writes the line-count report to stdout and all diagnostics to stderr.
	# returns 0 on success, 1 when the file cannot be split, 2 on a usage error.
	main() {
	  local file=$1 col=$2 delim=$3
	  local max_values=10
	  local src base idx value dir out
	  local -a values=() outputs=()

	  if [[ $# -lt 3 || -z ${file} || -z ${col} || -z ${delim} ]]; then
	    warn "usage: bash ${0##*/} file-to-be-split splitting-column 'delimiter'"
	    return 2
	  fi
	  if [[ ! -f ${file} || ! -r ${file} ]]; then
	    warn "Cannot read ${file}."
	    return 1
	  fi

	  # awk treats operands like -x or name=value as options/assignments; an explicit ./ avoids that
	  src=${file}
	  [[ ${src} == /* ]] || src=./${src}
	  base=${file##*/}

	  # without this check an unknown column would make awk split on the whole record
	  if ! idx=$(column_index "${src}" "${col}" "${delim}"); then
	    warn "Hmm. ${col} is not a column in the header record of ${file}."
	    return 1
	  fi

	  # run through the file to determine the distinct (upper-cased) split values
	  while IFS= read -r value; do
	    values+=("${value}")
	  done < <(
	    awk -v idx="${idx}" -F "${delim}" 'NR != 1 { print toupper($idx) }' "${src}" | sort -u
	  )

	  if [[ ${#values[@]} -eq 0 ]]; then
	    warn "Nothing to split. ${file} has no records after the header."
	    return 1
	  fi

	  # bail out if there are more than 10 split values
	  if [[ ${#values[@]} -gt ${max_values} ]]; then
	    warn "Whoa. ${col} has ${#values[@]} values and that is too rich for my blood."
	    printf '%s\n' "${values[@]}" | cat -n | tr '\t' ':' >&2
	    return 1
	  fi

	  # bail out if a split value cannot serve as a directory name
	  for value in "${values[@]}"; do
	    case ${value} in
	      *[[:space:]]*)
	        warn "Egad. The list of values seems to contain spaces and that is too confusing for me."
	        warn "${value//[[:space:]]/!}"
	        return 1
	        ;;
	      '' | . | .. | */*)
	        warn "Egad. The value '${value}' cannot be used as a directory name."
	        return 1
	        ;;
	    esac
	  done

	  # create one directory per value and start each output file with the header record
	  for value in "${values[@]}"; do
	    dir=${col}/${value}
	    out=${dir}/${base}
	    if ! mkdir -p -- "${dir}" || ! head -n 1 "${src}" > "${out}"; then
	      warn "Cannot write ${out}."
	      return 1
	    fi
	    outputs+=("${out}")
	  done

	  # the actual splitting of the file: append every data record to the file of its value
	  if ! awk -v idx="${idx}" -v dir="${col}" -v base="${base}" -F "${delim}" '
	    NR != 1 { out = dir "/" toupper($idx) "/" base; print >> out }
	  ' "${src}"; then
	    warn "Cannot split ${file}."
	    return 1
	  fi

	  # provide checks
	  echo "source file line count"
	  wc -l -- "${file}"
	  echo
	  echo "split file(s) line count"
	  wc -l -- "${outputs[@]}"
	}

	# last command of the script, so its status becomes the exit status
	main "$@"