Skip to content

Instantly share code, notes, and snippets.

@markedphillips
Last active June 8, 2019 08:24
Show Gist options
  • Save markedphillips/dfde64d123cfc1188918e03380ebf461 to your computer and use it in GitHub Desktop.
Save markedphillips/dfde64d123cfc1188918e03380ebf461 to your computer and use it in GitHub Desktop.
The following script pulls documents from a few directories, renames them using the "Lastname, Firstname" taken from the directory name plus a file keyword, converts the documents to text, and prepares them for ingestion into a sqlite3 database, where further regex work will be done to extract data from the files. This was based on a file system with /Documents/Smith, Johnson File Close…
#!/bin/bash
# Default root of the directory walk; view_dir replaces it with an absolute path.
# There must be no trailing backslash here: it would act as a line continuation
# and splice the following line onto this assignment.
ABS_PATH=.
# Colourise the output. The escape sequences are stored literally; they take
# effect because they are interpolated into printf format strings.
RED='\033[0;31m' # Red
GRE='\033[0;32m' # Green
YEL='\033[1;33m' # Yellow
NCL='\033[0m' # No Color
#######################################
# Print a coloured summary of the file whose path is held in ${entry}:
# full path, file name, directory, name without extension, extension, size.
# Globals:
#   entry (read)          - path of the file to describe
#   indent (read)         - caller's indentation; output is indented 4 further
#   GRE, YEL, NCL (read)  - colour sequences embedded in the printf formats
#   FILE_NAME, DIR, NAME, EXT, SIZE (written) - details of the file described
# Outputs:
#   Six lines on stdout.
#######################################
function file_specification() {
  local pad=$((indent + 4))

  FILE_NAME="$(basename "${entry}")"
  DIR="$(dirname "${entry}")"
  NAME="${FILE_NAME%.*}"
  EXT="${FILE_NAME##*.}"
  # A name without a dot ("README") would otherwise be reported as its own
  # extension, and a dotfile (".bashrc") would end up with an empty name.
  if [[ "${FILE_NAME}" != *.* || -z "${NAME}" ]]; then
    NAME="${FILE_NAME}"
    EXT=""
  fi
  SIZE="$(du -sh "${entry}" | cut -f1)"

  printf "%*s${GRE}%s${NCL}\n" "${pad}" '' "${entry}"
  printf "%*s\tFile name:\t${YEL}%s${NCL}\n" "${pad}" '' "${FILE_NAME}"
  printf "%*s\tDirectory:\t${YEL}%s${NCL}\n" "${pad}" '' "${DIR}"
  printf "%*s\tName only:\t${YEL}%s${NCL}\n" "${pad}" '' "${NAME}"
  printf "%*s\tExtension:\t${YEL}%s${NCL}\n" "${pad}" '' "${EXT}"
  printf "%*s\tFile size:\t${YEL}%s${NCL}\n" "${pad}" '' "${SIZE}"
}
#######################################
# Recursively print a directory tree: the directory's own path first, then the
# details of every regular file directly inside it, then each sub-directory.
# Arguments:
#   $1 - directory to list
#   $2 - indentation width for this level (default 0); grows by 4 per level
# Globals:
#   RED, NCL (read) - colour sequences embedded in the printf format
#######################################
function walk() {
  local indent="${2:-0}"
  # Local to this call, yet still visible to file_specification because bash
  # resolves variables through the call stack.
  local entry

  printf "\n%*s${RED}%s${NCL}\n\n" "$indent" '' "$1"

  # If the entry is a file do some operations
  for entry in "$1"/*; do
    if [[ -f "$entry" ]]; then
      file_specification
    fi
  done

  # If the entry is a directory call walk() == create recursion
  for entry in "$1"/*; do
    if [[ -d "$entry" ]]; then
      walk "$entry" $((indent + 4))
    fi
  done
}
#######################################
# Print the tree rooted at a directory by handing its absolute path to walk().
# Arguments:
#   $1 - directory to show; empty or omitted means the current directory
# Globals:
#   ABS_PATH (written) - absolute path of the directory that is shown
# Side effects:
#   When $1 is given, the shell's working directory is changed to it.
# Returns:
#   1 if $1 cannot be entered; walk() is not called in that case.
#######################################
function view_dir () {
  if [[ -n "${1}" ]]; then
    # Entering the directory is what turns a relative path into an absolute
    # one (via ${PWD}); stop here rather than walking a stale or empty path.
    cd "${1}" || {
      printf 'view_dir: cannot enter directory: %s\n' "${1}" >&2
      return 1
    }
  fi
  ABS_PATH="${PWD}"
  walk "${ABS_PATH}"
  echo
}
# Search two directory levels down for Keyword*.doc / Keyword*.docx files and
# copy each one into <timestamp>/Keyword, renamed to
# "Lastname, Firstname_Keyword.<ext>" using the name of its top-level directory.

#######################################
# Derive the "Lastname, Firstname" label from a path shaped like
# "./Lastname, Firstname .../subdir/file".
# Arguments:
#   $1 - path beginning with "./"
# Outputs:
#   The label on stdout. When the leading directory does not have the
#   "Lastname, Firstname" shape, its whole name is printed instead so that
#   unrelated files do not all collapse onto the same destination name.
#######################################
function client_name_from_path() {
  # The hyphen sits last in the bracket expression so it is taken literally;
  # [:space:] is the portable way to allow blanks inside a bracket expression.
  local pattern='^\./([a-zA-Z0-9[:space:]-]*,[[:space:]][a-zA-Z0-9]*)'
  local top_dir
  if [[ "${1}" =~ ${pattern} ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  else
    top_dir="${1#./}"
    printf '%s\n' "${top_dir%%/*}"
  fi
}

TIMESTAMP="$(date '+%Y%m%d_%H-%M-%S')"
mkdir -p -- "${TIMESTAMP}/Keyword" || exit 1
echo "Copying source files"
# globstar is not enabled in this script, so each ** matches exactly one path
# component: only files two directories below the current one are found.
for ext in doc docx; do
  for f in ./**/**/Keyword*."${ext}"; do
    [[ -e "${f}" ]] || continue # the glob matched nothing
    cp -v -- "${f}" "${TIMESTAMP}/Keyword/$(client_name_from_path "${f}")_Keyword.${ext}"
  done
done
echo "Listing contents"
ls -al "${TIMESTAMP}"/Keyword
# Convert the Word documents in the current directory to .docx and .txt with
# textutil, archive every variant under a fresh timestamped directory
# (Originals, Word, Zip, Xml, Txt), unpack each .docx into Xml/<name>/, and
# finally delete the .doc/.docx files from the current directory.
sleep 1 # Make sure to get a new timestamp
# Same layout as the TIMESTAMP format used for the Keyword directory.
TIMESTAMP_1="$(date '+%Y%m%d_%H-%M-%S')"
for sub_dir in Originals Word Zip Xml Txt; do
  mkdir -p -- "${TIMESTAMP_1}/${sub_dir}" || exit 1
done
echo "convert doc to docx"
for f in *.doc; do
  [[ -e "${f}" ]] || continue # the glob matched nothing
  textutil -convert docx "${f}"
done
# Covers both the freshly converted files and any .docx that was already here.
for f in *.docx; do
  [[ -e "${f}" ]] || continue
  textutil -convert txt "${f}"
done
# The copies below expect the converted files to sit in the current directory.
cp -v -- *.doc "${TIMESTAMP_1}/Originals/"
cp -v -- *.docx "${TIMESTAMP_1}/Word/"
cp -v -- *.docx "${TIMESTAMP_1}/Xml/"
cp -v -- *.txt "${TIMESTAMP_1}/Txt"
start_dir="${PWD}"
# Abort if the directory cannot be entered: everything that follows, including
# the final rm, would otherwise run somewhere it was never meant to.
cd "${TIMESTAMP_1}/Xml/" || exit 1
for f in *.docx; do
  [[ -e "${f}" ]] || continue
  # Strip the literal ".docx" suffix. A bracket expression such as [^.docx]
  # is a character set and would also eat trailing d/o/c/x/. from the name.
  new_file="${f%.docx}"
  cp -v -- "${f}" "${new_file}.zip"
  unzip "${f}" -d "${new_file}"
done
mv -v -- *.zip ../Zip
cd "${start_dir}" || exit 1
rm -v -- *.docx *.doc
# Now we have the converted textfiles, clean and prepare for SQLITE3 insertion

#######################################
# For every *.txt file in the current directory, write a copy in which runs of
# blank lines are squeezed to a single blank line (cat -s).
# "name.txt" is written to "name.txt_.txt"; the source file is left untouched.
#######################################
function squeeze_blank_lines() {
  local f
  # The file list is expanded once, before the loop starts, so the copies
  # created here are not picked up again during the same run.
  for f in *.txt; do
    [[ -f "${f}" ]] || continue # the glob matched nothing
    cat -s -- "${f}" > "${f}_.txt"
  done
}
squeeze_blank_lines
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment