#!/usr/bin/env bash
# verachell/getphrases.sh
#
# getphrases.sh - split a plain-text book into phrases, one phrase per line.
#
# Usage: getphrases.sh [FILE...]
#   With no FILE, text is read from standard input.
#
# Designed for getting phrases from a Project Gutenberg book.
# Phrases are defined here as anything between commas and between certain
# other punctuation, with the punctuation removed in the process.
# Additional unwanted or outdated punctuation and chapter headings are also
# removed. This script was created to work with one particular book, so your
# mileage may vary, especially for chapter-heading removal, because each book
# may format its headings differently.
#
# Credit for empty-line removal using grep:
# https://stackoverflow.com/questions/16414410/delete-empty-lines-using-sed
#
# Strict mode (set -e / pipefail) is deliberately not enabled: grep exits
# with status 1 when it selects no lines, which is a legitimate outcome
# here (for example, empty input) rather than an error.

#######################################
# Print the phrases found in the given files (or standard input).
# Arguments: zero or more file paths; with none, standard input is read
# Outputs:   one phrase per line on stdout; surrounding spaces are kept
# Returns:   exit status of the final grep (1 when nothing is printed)
#######################################
getphrases() {
  local nl=$'\n'

  # 1. Undo word wrapping: CR and LF become spaces, runs of spaces collapse.
  #    The replacement set is padded to the length of the source set because
  #    POSIX leaves a shorter second set unspecified.
  # 2. Split at double quote, comma, semicolon and full stop (all removed).
  # 3. Split after each question mark, keeping the mark; backslash-newline
  #    is the portable way to insert a newline in a sed replacement.
  #    The same sed call turns '--' into a single space.
  # 4. Drop '_' emphasis markers.
  # 5. Drop chapter-heading indicators (lines containing '* *' or the
  #    section sign) and lines that are empty or whitespace-only.
  cat -- "$@" |
    tr -s '\r\n' '  ' |
    tr '",;.' '\n\n\n\n' |
    sed -e "s/?/?\\${nl}/g" -e 's/--/ /g' |
    tr -d '_' |
    grep -v -e '\* \*' -e '§' -e '^[[:space:]]*$'
}

# Print a short usage message on stderr; returns 2 so the script's exit
# status signals misuse when this is the last command executed.
usage() {
  printf 'Usage: %s [FILE...]\n' "${0##*/}" >&2
  return 2
}

# With no arguments on an interactive terminal there is nothing to read,
# so show the usage text instead of silently waiting for keyboard input.
if (( $# == 0 )) && [[ -t 0 ]]; then
  usage
else
  getphrases "$@"
fi