
@albert-decatur
Last active August 29, 2015 14:24
magic stopword removal from road names
# bash magic - passing an arbitrary number of sed commands to eval on a file to remove a stopword list built from 1-gram frequency and lack of vowels
# first we get a list of stopwords based on short strings, esp. w/o vowels, that are the most common 1-grams
# then we build sed commands around these stopwords, using word boundaries (eg \bRd\b)
# and we ask for a case-insensitive match (eg sed 's/pattern//I')
# then we string them together with pipes
# then cat the file of interest and eval all the sed commands!
# this removes the stopword list we build, with case-insensitive match, and using word boundaries
# advantage over sed alternation (eg sed 's/Rd\|St//I') is that alternation can only handle so many patterns at a time
# would be more efficient of course to batch these into groups of *n*
rm_stopwords=$(
n=6
ngrams tiger.csv 1 |\
sortfreq |\
sed '1d' |\
awk "{if(length(\$2)<= $n || \$2 ~ /^[^aeiou]*$/ )print \$0}"|\
head -n 35|\
tawk '{print $2}' |\
sed 's:^:sed "s/\\b:g;s:$:\\b//I":g' |\
tr '\n' '|' |\
sed 's:|$::g'
)
cat tiger.csv |\
eval "$rm_stopwords"
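The generated-sed-pipeline trick above can be sketched on its own, without the ngrams/sortfreq helpers — a minimal sketch assuming GNU sed (for `\b` and the `I` flag) and an inline toy stopword list:

```shell
# toy stopword list standing in for the ngram-derived one above
stopwords=$'Rd\nSt\nAve'
# wrap each word in a case-insensitive, word-bounded sed substitution (GNU sed)
rm_stop=$( echo "$stopwords" | sed 's:^:sed "s/\\b:g;s:$:\\b//I":g' | tr '\n' '|' | sed 's:|$::g' )
# rm_stop now holds: sed "s/\bRd\b//I"|sed "s/\bSt\b//I"|sed "s/\bAve\b//I"
# eval reparses the string, so the pipes become a real pipeline over stdin
echo "Main St and Oak Rd" | eval "$rm_stop"
```

Note the stopwords themselves are removed but the surrounding spaces are not, so a `sed 's/ \+/ /g'` cleanup pass may be wanted afterwards.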
# make three distinct lists of stopwords out of tiger road fullname unigrams
# NB: each list a subset of the inverse of the previous list
# NB: uses dotfiles for data science
# 1 too short or includes number
# 2 has no vowels
# 3 top _n_ most common whole words (not too short, has vowel, doesn't include number)
# NB: be generous with the whole words list as you will want to curate it by hand at least a little to avoid making meaningful stuff into a stopword
# take this many of the most common whole words
ntop_wholeWords=400
# consider any unigram shorter than this to be a stopword
minlength=4
# this is our temporary unigram list - we will progressively shorten this so that one type of list does not bleed into another type
unigrams=$(mktemp)
# just a temp file to store the street names - should rewrite ngram function to take STDIN actually
tmp=$(mktemp)
# make that list of unigrams
cat fullnames_uniq.txt | c 2 | sed '1d' > $tmp
# get a unique list of unigrams with counts of frequency
ngrams $tmp 1 | sortfreq > $unigrams
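ngrams and sortfreq come from the dotfiles-for-data-science helpers; as a rough coreutils stand-in, the same unigram frequency table (count<TAB>token, most frequent first) can be built like this — toy data shown, real input would be $tmp:

```shell
# toy input file standing in for $tmp
tokfile=$(mktemp)
printf 'Main St\nOak Ave\nOak St\n' > $tokfile
# split on whitespace, count unique tokens, sort by count descending,
# emit count<TAB>token like sortfreq does
tr -s '[:space:]' '\n' < $tokfile | sort | uniq -c | sort -rn | awk -v OFS='\t' '{print $1, $2}'
```

Tokens tied on count come out in reverse lexical order here, which sortfreq may or may not match — treat the ordering of ties as unspecified.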
# start building our 3 types of stopwords lists
# first up is too short or contains a number
stopwords_shortOrNum=$( cat $unigrams | mawk "{if( \$2 ~ /[0-9]/|| length(\$2)< $minlength )print \$0}" )
# save these stopwords to file
echo "$stopwords_shortOrNum" | c 2 > stop_shortOrNum
# remove these matches from the unique frequency unigram list
cat $unigrams | grep -vFf <( echo "$stopwords_shortOrNum" ) | sponge $unigrams
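sponge comes from moreutils (it soaks up all of stdin before writing, so a file can be filtered in place); if moreutils is unavailable, the same in-place filter can use an explicit temp file — a sketch with toy data:

```shell
# toy unigram list and stopword subset, illustrative only
unigrams=$(mktemp)
printf '9\trd\n5\tmain\n3\thwy\n' > $unigrams
stopwords_shortOrNum=$'9\trd\n3\thwy'
# filter the matching lines into a temp file, then move it back over the original
tmpout=$(mktemp)
grep -vFf <( echo "$stopwords_shortOrNum" ) $unigrams > $tmpout && mv $tmpout $unigrams
cat $unigrams
```

One caveat versus sponge: if grep matches nothing (exit status 1) the mv is skipped, which happens to be safe here but is a behavioral difference.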
# next up! make a stopword list when unigrams have no ASCII vowel (counting y as a vowel here)
stopwords_noVowel=$( cat $unigrams | mawk "{if(\$2 ~ /^[^aeiouy]*$/ )print \$0}" )
# write that list to file
echo "$stopwords_noVowel" | c 2 > stop_noVowel
# remove it from the unique unigram freq list
cat $unigrams | grep -vFf <( echo "$stopwords_noVowel" ) | sponge $unigrams
# now get a list of the _n_ top unigrams that are left by frequency
# be generous with the length of this list, as you may have to remove potential stopwords from it by hand
# whole words are gold
cat $unigrams | sed '1d' | head -n $ntop_wholeWords | c 2 > stop_wholeWords
# finally, apply every stopword list in batches of 1000 sed commands using GNU parallel
in=/tmp/streets
cp $in /tmp/1
# build one word-bounded, case-insensitive sed command per stopword, each line ending in "|\"
rm_all_stopwords=$( cat stopwords/* | sed 's:^:sed "s/\\b:g;s:$:\\b//I":g' | sed 's:$:\|\\:g' )
# parallel -j1 --pipe -N1000 feeds the commands 1000 at a time; each batch strips
# the trailing "|\" from its last line, evals the batch as a pipeline over /tmp/{#},
# writes /tmp/{#}+1, and removes the previous pass
echo "$rm_all_stopwords" | parallel -j1 --pipe -N1000 'rm_stop=$( cat | sed "$ s:|\\\\$::g" ); cat /tmp/{#} | eval "$rm_stop" > /tmp/$(expr {#} + 1); rm /tmp/{#}'
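The same batching idea can be sketched without GNU parallel, using split to chunk the generated sed commands; toy data and a batch size of 1 for illustration (a real run would use e.g. 1000), again assuming GNU sed:

```shell
workdir=$(mktemp -d)
printf 'Main St\nOak Rd\n' > $workdir/data
# two toy sed commands standing in for the generated stopword removers
printf 'sed "s/\\bSt\\b//I"\nsed "s/\\bRd\\b//I"\n' > $workdir/cmds
# chunk the commands; -l 1 here, -l 1000 in practice
split -l 1 $workdir/cmds $workdir/chunk_
for chunk in $workdir/chunk_*; do
  # join the chunk's commands into one pipeline string and eval it over the running file
  batch=$( tr '\n' '|' < $chunk | sed 's:|$::' )
  eval "$batch" < $workdir/data > $workdir/data.next
  mv $workdir/data.next $workdir/data
done
cat $workdir/data
```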