markedphillips/prep_docs_ingest.sh

## prep_docs_ingest.sh
#!/bin/bash

# Default directory to walk; view_dir() replaces it with an absolute path.
# No trailing backslash here: a backslash at end of line is a line
# continuation and would splice the following line into this assignment.
ABS_PATH=.

# Colourise the output (escape sequences are interpreted by printf)
RED='\033[0;31m'        # Red
GRE='\033[0;32m'        # Green
YEL='\033[1;33m'        # Yellow
NCL='\033[0m'           # No Color

# Print a short, colourised summary of one file.
# Globals read:    entry  - path of the file to describe
#                  indent - indentation of the enclosing directory
#                           (an unset value counts as 0)
#                  GRE, YEL, NCL - colour escape sequences
# Globals written: FILE_NAME, DIR, NAME, EXT, SIZE
# Outputs:         six lines on stdout: path, file name, directory,
#                  name without extension, extension, size (du -sh)
function file_specification() {
  # Files are printed 4 columns deeper than the directory that holds them.
  local pad=$((indent+4))

  FILE_NAME="$(basename "${entry}")"
  DIR="$(dirname "${entry}")"
  if [[ "${FILE_NAME}" == ?*.* ]]; then
    NAME="${FILE_NAME%.*}"
    EXT="${FILE_NAME##*.}"
  else
    # No dot at all, or only a leading dot (hidden file): the whole string
    # is the name and there is no extension. "${FILE_NAME##*.}" alone would
    # report the complete file name as its own extension.
    NAME="${FILE_NAME}"
    EXT=""
  fi
  SIZE="$(du -sh "${entry}" | cut -f1)"

  printf "%*s${GRE}%s${NCL}\n"                    "${pad}" '' "${entry}"
  printf "%*s\tFile name:\t${YEL}%s${NCL}\n"      "${pad}" '' "$FILE_NAME"
  printf "%*s\tDirectory:\t${YEL}%s${NCL}\n"      "${pad}" '' "$DIR"
  printf "%*s\tName only:\t${YEL}%s${NCL}\n"      "${pad}" '' "$NAME"
  printf "%*s\tExtension:\t${YEL}%s${NCL}\n"      "${pad}" '' "$EXT"
  printf "%*s\tFile size:\t${YEL}%s${NCL}\n"      "${pad}" '' "$SIZE"
}

# Recursively print a directory tree: the directory name, then the files
# directly inside it (via file_specification), then every sub-directory
# indented 4 columns further.
# Arguments: $1 - directory to list
#            $2 - indentation in columns (optional, default 0)
# Globals:   RED, NCL (read)
function walk() {
  local indent="${2:-0}"
  local entry
  printf "\n%*s${RED}%s${NCL}\n\n" "$indent" '' "$1"
  # Files first. file_specification takes no arguments; it reads "entry"
  # and "indent" from this function's scope.
  for entry in "$1"/*; do
    if [[ -f "$entry" ]]; then
      file_specification
    fi
  done
  # Then descend into each sub-directory, one level deeper.
  for entry in "$1"/*; do
    if [[ -d "$entry" ]]; then
      walk "$entry" $((indent+4))
    fi
  done
}

# Print the directory tree below a path using walk(), followed by an empty line.
# Arguments:   $1 - directory to show (optional; the current directory if empty)
# Globals:     ABS_PATH (written) - absolute path of the directory shown
# Side effect: when $1 is given, the working directory changes to it
#              (cd + $PWD is how a relative path becomes absolute).
# Returns:     1 if "$1" cannot be entered; ABS_PATH is left untouched and
#              walk() is not called in that case.
function view_dir () {
  if [[ -n "${1}" ]]; then
    cd -- "${1}" || return 1
  fi
  ABS_PATH="${PWD}"
  walk "${ABS_PATH}"
  echo
}

# Search for key files and rename them based on "Lastname, Firstname" from the
# directory path, saving the copies in a new timestamped directory.

TIMESTAMP="$(date '+%Y%m%d_%H-%M-%S')"
mkdir -p -- "${TIMESTAMP}/Keyword" || exit 1

# Matches the leading "./Lastname, Firstname" of a path and captures the name.
# POSIX classes are used because "\s" is not a whitespace escape inside a
# bracket expression.
person_re='^\./([a-zA-Z0-9[:space:]-]*,[[:space:]][a-zA-Z0-9]*)'

echo "Copying source files"
# Keyword documents are looked up exactly two directories below the current
# one. All .doc files are handled before the .docx files.
for f in ./*/*/Keyword*.doc ./*/*/Keyword*.docx; do
  # A glob without matches is left as the literal pattern; skip it.
  [[ -e "${f}" ]] || continue
  if [[ "${f}" =~ ${person_re} ]]; then
    cp -v -- "${f}" "${TIMESTAMP}/Keyword/${BASH_REMATCH[1]}_Keyword.${f##*.}"
  else
    # With an empty name every such file would land on the same
    # "_Keyword.<ext>" target and overwrite the previous one.
    printf 'Skipping %s: no "Lastname, Firstname" directory in its path\n' "${f}" >&2
  fi
done

echo "Listing contents"
ls -al "${TIMESTAMP}"/Keyword

sleep 1 # Make sure to get a new timestamp

# Same date format as TIMESTAMP, so the two run directories differ only by
# the second at which they were created.
TIMESTAMP_1="$(date '+%Y%m%d_%H-%M-%S')"

# One sub-directory per artefact type produced by the steps that follow.
for sub in Originals Word Zip Xml Txt; do
  mkdir -p -- "${TIMESTAMP_1}/${sub}" || exit 1
done

echo "convert doc to docx"
# A glob without matches is left as the literal pattern (e.g. "*.doc"), so
# every loop below first checks that the entry really exists.
for f in *.doc; do
  [[ -e "${f}" ]] || continue
  textutil -convert docx "${f}"
done
# Runs after the step above, so freshly converted .docx files are included.
for f in *.docx; do
  [[ -e "${f}" ]] || continue
  textutil -convert txt  "${f}"
done

# Stage the files by type under the run directory. The .docx files go to
# both Word/ and Xml/.
for f in *.doc; do
  [[ -e "${f}" ]] || continue
  cp -v -- "${f}" "${TIMESTAMP_1}/Originals/"
done
for f in *.docx; do
  [[ -e "${f}" ]] || continue
  cp -v -- "${f}" "${TIMESTAMP_1}/Word/"
done
for f in *.docx; do
  [[ -e "${f}" ]] || continue
  cp -v -- "${f}" "${TIMESTAMP_1}/Xml/"
done
for f in *.txt; do
  [[ -e "${f}" ]] || continue
  cp -v -- "${f}" "${TIMESTAMP_1}/Txt"
done

# For every staged .docx: keep a .zip copy in ../Zip and unpack the document
# into a directory named after it.
start_dir="${PWD}"
# Stop if the directory is missing: the commands below, and the clean-up that
# follows them, must never run from an unexpected working directory.
cd "${TIMESTAMP_1}/Xml/" || exit 1
for f in *.docx; do
  [[ -e "${f}" ]] || continue
  # Remove only the ".docx" suffix. A bracket expression such as [^.docx] is
  # a set of single characters and would also cut trailing d/o/c/x letters
  # off the base name.
  new_file="${f%.docx}"
  cp -v -- "${f}" "${new_file}.zip"
  unzip "${f}" -d "${new_file}"
  mv -v -- "${new_file}.zip" ../Zip
done
cd "${start_dir}" || exit 1

# Remove the Word documents from the working directory.
rm -v -- *.docx *.doc

# Now we have the converted textfiles, clean and prepare for SQLITE3 insertion
for f in *.txt; do
  # A glob without matches is left as the literal pattern; skip it.
  [[ -e "${f}" ]] || continue
  # cat -s squeezes runs of empty lines; the result is written next to the
  # source as "<file>_.txt".
  cat -s -- "${f}" > "${f}_.txt"
done
	#!/bin/bash

	# Default directory to walk; view_dir() replaces it with an absolute path.
	# No trailing backslash here: a backslash at end of line is a line
	# continuation and would splice the following line into this assignment.
	ABS_PATH=.

	# Colourise the output (escape sequences are interpreted by printf)
	RED='\033[0;31m' # Red
	GRE='\033[0;32m' # Green
	YEL='\033[1;33m' # Yellow
	NCL='\033[0m' # No Color

	# Print a short, colourised summary of one file.
	# Globals read:    entry  - path of the file to describe
	#                  indent - indentation of the enclosing directory
	#                           (an unset value counts as 0)
	#                  GRE, YEL, NCL - colour escape sequences
	# Globals written: FILE_NAME, DIR, NAME, EXT, SIZE
	# Outputs:         six lines on stdout: path, file name, directory,
	#                  name without extension, extension, size (du -sh)
	function file_specification() {
	  # Files are printed 4 columns deeper than the directory that holds them.
	  local pad=$((indent+4))

	  FILE_NAME="$(basename "${entry}")"
	  DIR="$(dirname "${entry}")"
	  if [[ "${FILE_NAME}" == ?*.* ]]; then
	    NAME="${FILE_NAME%.*}"
	    EXT="${FILE_NAME##*.}"
	  else
	    # No dot at all, or only a leading dot (hidden file): the whole string
	    # is the name and there is no extension. "${FILE_NAME##*.}" alone would
	    # report the complete file name as its own extension.
	    NAME="${FILE_NAME}"
	    EXT=""
	  fi
	  # The "|" must be a real, unescaped pipe so that cut receives du's output.
	  SIZE="$(du -sh "${entry}" | cut -f1)"

	  printf "%*s${GRE}%s${NCL}\n" "${pad}" '' "${entry}"
	  printf "%*s\tFile name:\t${YEL}%s${NCL}\n" "${pad}" '' "$FILE_NAME"
	  printf "%*s\tDirectory:\t${YEL}%s${NCL}\n" "${pad}" '' "$DIR"
	  printf "%*s\tName only:\t${YEL}%s${NCL}\n" "${pad}" '' "$NAME"
	  printf "%*s\tExtension:\t${YEL}%s${NCL}\n" "${pad}" '' "$EXT"
	  printf "%*s\tFile size:\t${YEL}%s${NCL}\n" "${pad}" '' "$SIZE"
	}

	# Recursively print a directory tree: the directory name, then the files
	# directly inside it (via file_specification), then every sub-directory
	# indented 4 columns further.
	# Arguments: $1 - directory to list
	#            $2 - indentation in columns (optional, default 0)
	# Globals:   RED, NCL (read)
	function walk() {
	  local indent="${2:-0}"
	  local entry
	  printf "\n%*s${RED}%s${NCL}\n\n" "$indent" '' "$1"
	  # Files first. file_specification takes no arguments; it reads "entry"
	  # and "indent" from this function's scope.
	  for entry in "$1"/*; do
	    if [[ -f "$entry" ]]; then
	      file_specification
	    fi
	  done
	  # Then descend into each sub-directory, one level deeper.
	  for entry in "$1"/*; do
	    if [[ -d "$entry" ]]; then
	      walk "$entry" $((indent+4))
	    fi
	  done
	}

	# Print the directory tree below a path using walk(), followed by an empty line.
	# Arguments:   $1 - directory to show (optional; the current directory if empty)
	# Globals:     ABS_PATH (written) - absolute path of the directory shown
	# Side effect: when $1 is given, the working directory changes to it
	#              (cd + $PWD is how a relative path becomes absolute).
	# Returns:     1 if "$1" cannot be entered; ABS_PATH is left untouched and
	#              walk() is not called in that case.
	function view_dir () {
	  if [[ -n "${1}" ]]; then
	    cd -- "${1}" || return 1
	  fi
	  ABS_PATH="${PWD}"
	  walk "${ABS_PATH}"
	  echo
	}

	# Search for key files and rename them based on "Lastname, Firstname" from the
	# directory path, saving the copies in a new timestamped directory.

	TIMESTAMP="$(date '+%Y%m%d_%H-%M-%S')"
	mkdir -p -- "${TIMESTAMP}/Keyword" || exit 1

	# Matches the leading "./Lastname, Firstname" of a path and captures the name.
	# POSIX classes are used because "\s" is not a whitespace escape inside a
	# bracket expression.
	person_re='^\./([a-zA-Z0-9[:space:]-]*,[[:space:]][a-zA-Z0-9]*)'

	echo "Copying source files"
	# Keyword documents are looked up exactly two directories below the current
	# one. All .doc files are handled before the .docx files.
	for f in ./*/*/Keyword*.doc ./*/*/Keyword*.docx; do
	  # A glob without matches is left as the literal pattern; skip it.
	  [[ -e "${f}" ]] || continue
	  if [[ "${f}" =~ ${person_re} ]]; then
	    cp -v -- "${f}" "${TIMESTAMP}/Keyword/${BASH_REMATCH[1]}_Keyword.${f##*.}"
	  else
	    # With an empty name every such file would land on the same
	    # "_Keyword.<ext>" target and overwrite the previous one.
	    printf 'Skipping %s: no "Lastname, Firstname" directory in its path\n' "${f}" >&2
	  fi
	done

	echo "Listing contents"
	ls -al "${TIMESTAMP}"/Keyword

	sleep 1 # Make sure to get a new timestamp

	# Same date format as TIMESTAMP, so the two run directories differ only by
	# the second at which they were created.
	TIMESTAMP_1="$(date '+%Y%m%d_%H-%M-%S')"

	# One sub-directory per artefact type produced by the steps that follow.
	for sub in Originals Word Zip Xml Txt; do
	  mkdir -p -- "${TIMESTAMP_1}/${sub}" || exit 1
	done

	echo "convert doc to docx"
	# A glob without matches is left as the literal pattern (e.g. "*.doc"), so
	# every loop below first checks that the entry really exists.
	for f in *.doc; do
	  [[ -e "${f}" ]] || continue
	  textutil -convert docx "${f}"
	done
	# Runs after the step above, so freshly converted .docx files are included.
	for f in *.docx; do
	  [[ -e "${f}" ]] || continue
	  textutil -convert txt "${f}"
	done

	# Stage the files by type under the run directory. The .docx files go to
	# both Word/ and Xml/.
	for f in *.doc; do
	  [[ -e "${f}" ]] || continue
	  cp -v -- "${f}" "${TIMESTAMP_1}/Originals/"
	done
	for f in *.docx; do
	  [[ -e "${f}" ]] || continue
	  cp -v -- "${f}" "${TIMESTAMP_1}/Word/"
	done
	for f in *.docx; do
	  [[ -e "${f}" ]] || continue
	  cp -v -- "${f}" "${TIMESTAMP_1}/Xml/"
	done
	for f in *.txt; do
	  [[ -e "${f}" ]] || continue
	  cp -v -- "${f}" "${TIMESTAMP_1}/Txt"
	done

	# For every staged .docx: keep a .zip copy in ../Zip and unpack the document
	# into a directory named after it.
	start_dir="${PWD}"
	# Stop if the directory is missing: the commands below, and the clean-up that
	# follows them, must never run from an unexpected working directory.
	cd "${TIMESTAMP_1}/Xml/" || exit 1
	for f in *.docx; do
	  [[ -e "${f}" ]] || continue
	  # Remove only the ".docx" suffix. A bracket expression such as [^.docx] is
	  # a set of single characters and would also cut trailing d/o/c/x letters
	  # off the base name.
	  new_file="${f%.docx}"
	  cp -v -- "${f}" "${new_file}.zip"
	  unzip "${f}" -d "${new_file}"
	  mv -v -- "${new_file}.zip" ../Zip
	done
	cd "${start_dir}" || exit 1

	# Remove the Word documents from the working directory. The patterns are
	# globs over the file extensions, not files literally named ".docx"/".doc".
	rm -v -- *.docx *.doc

	# Now we have the converted textfiles, clean and prepare for SQLITE3 insertion
	for f in *.txt; do
	  # A glob without matches is left as the literal pattern; skip it.
	  [[ -e "${f}" ]] || continue
	  # cat -s squeezes runs of empty lines; the result is written next to the
	  # source as "<file>_.txt".
	  cat -s -- "${f}" > "${f}_.txt"
	done