@gwerbin
Last active August 29, 2015 14:09
Data processing in Bash -- some examples
#!/usr/bin/env bash
## Process the "Housing" data set from the UCI Machine Learning Data Repository
## download the "housing" dataset; this will also tell you how big it is
curl https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data -O
## check if it has carriage returns in line endings
if grep -q $'\r$' housing.data; then echo 'has CR line endings'
else echo 'no CR line endings'
fi
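If the check does report CR line endings, `tr -d '\r'` strips them. A minimal sketch on a throwaway sample file (so it doesn't touch housing.data itself):

```shell
# Create a small CRLF sample, strip the carriage returns, verify, clean up.
printf 'a b\r\nc d\r\n' > sample.txt
tr -d '\r' < sample.txt > sample_unix.txt
if grep -q $'\r' sample_unix.txt; then echo 'still has CR line endings'
else echo 'no CR line endings'
fi
rm -f sample.txt sample_unix.txt
```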
## see how long the first row is (i.e. is it safe to print row by row?)
head -n1 housing.data | wc -c
## then see how many rows there are
wc -l housing.data
## then print the first few rows
head housing.data
## see that it's whitespace-delimited, with no spaces in the fields themselves
## strip out any leading/trailing whitespace and convert to comma-delimited with sed
sed -E -e 's/^[[:blank:]]+|[[:blank:]]+$//g' -e 's/[[:blank:]]+/,/g' housing.data > housing.csv
## for some reason my version of sed doesn't like shorthand character classes, otherwise it'd be
# sed -E -e 's/^\s+|\s+$//' -e 's/\s+/,/g' housing.data > housing.csv
## or with AWK (which I like better)
# awk -v OFS=',' '{ $1=$1; print }' housing.data > housing.csv
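Why `$1 = $1` works: assigning to any field makes awk rebuild `$0`, joining the fields with `OFS`, which as a side effect collapses runs of whitespace and drops leading/trailing blanks. A quick demo on inline input:

```shell
# Whitespace-delimited input with ragged spacing comes out comma-delimited.
printf '  1.2   3.4  5\n' | awk -v OFS=',' '{ $1 = $1; print }'
# prints: 1.2,3.4,5
```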
## tabulate the 4th column
cut -d, -f4 housing.csv | sort -n | uniq -c
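The same tabulation can be done in a single awk pass with an associative array; sketched here on inline sample rows rather than on housing.csv:

```shell
# Count occurrences of each distinct value in the 4th field.
printf 'x,y,z,0\nx,y,z,1\nx,y,z,0\n' |
    awk -F, '{ n[$4]++ } END { for (v in n) print n[v], v }' | sort -k2n
# prints: 2 0
#         1 1
```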
## copy the column names from the website
awk '{ sub(/:$/, "", $2); printf "%s%s", (NR > 1 ? "," : ""), $2 } END { print "" }' << EOF | cat - housing.csv > housing_header.csv
1. CRIM: per capita crime rate by town
2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
3. INDUS: proportion of non-retail business acres per town
4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
5. NOX: nitric oxides concentration (parts per 10 million)
6. RM: average number of rooms per dwelling
7. AGE: proportion of owner-occupied units built prior to 1940
8. DIS: weighted distances to five Boston employment centres
9. RAD: index of accessibility to radial highways
10. TAX: full-value property-tax rate per $10,000
11. PTRATIO: pupil-teacher ratio by town
12. B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
13. LSTAT: % lower status of the population
14. MEDV: Median value of owner-occupied homes in $1000's
EOF
mv -f housing_header.csv housing.csv
## cut the 9th and 14th columns for use in the homework assignment
cut -d, -f9,14 housing.csv > rad_medv.csv
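A quick sanity check on any of these CSVs is that every row has the same number of fields (14 for housing.csv, 2 for rad_medv.csv); demoed on an inline sample, but the same pipe works on the real files:

```shell
# A single surviving value from sort -u means every row has the same width.
printf 'a,b,c\nd,e,f\n' | awk -F, '{ print NF }' | sort -u
# prints: 3
```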
#!/usr/bin/env bash
## Prepare the extant books of Livy's history (1-10 and 21-45) for text processing, and plot the distribution of paragraph lengths in each book
## Figure out where the text files are stored
curl http://thelatinlibrary.com/liv.html | grep Liber
## Extract the file names and download each one
for book in $(curl -s http://thelatinlibrary.com/liv.html | egrep -o 'liv\.[0-9]+\.shtml'); do
curl -s "http://thelatinlibrary.com/livy/$book" -O
done
## Total size
du -ch liv.*.shtml | tail -n1
## check the length of the first file and scroll through it
wc -l liv.1.shtml
# less liv.1.shtml
## figure out how to un-HTML it
# brew search html
# brew install html2text
## this is pretty cool! http://www.mbayer.de/html2text/
for file in *.shtml; do
filebase=$(echo "$file" | tr '.' '_' | sed 's/_shtml//')
html2text "$file" > "${filebase}.txt"
done
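If html2text isn't available, a crude fallback is to delete anything that looks like a tag with sed; it loses entity decoding and layout, but is serviceable for plain-prose pages like these:

```shell
# Strip HTML tags (naive: assumes no literal '<' inside text or attributes).
printf '<p>Ab urbe <b>condita</b></p>\n' | sed -e 's/<[^>]*>//g'
# prints: Ab urbe condita
```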
# less liv_1.txt
## found a stray non-ASCII character in there. I checked on the website and it's a badly-encoded em dash.
## also the first few lines and the last line need to be deleted.
for file in *.txt; do
iconv -f ISO-8859-1 -t ascii --unicode-subst="<U+%x>" "$file" | sed -e '1,7 d' -e '$ d' > "ascii_$file"
rm -f "$file"
done
## for each file, count the number of words in each paragraph and save it to a file for processing
for file in ascii*; do
tr '\n' ' ' < "$file" | perl -pe 's/ \[\d+\] /\n/g' | awk '{ printf "%s%s", (NR > 1 ? "," : ""), NF } END { print "" }' >> word_counts.csv
done
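The tr/perl/awk pipeline can also be collapsed into one awk call by treating the `[n]` paragraph markers as record separators. A regex `RS` is supported by gawk, mawk, and busybox awk, though not strictly POSIX; a sketch on inline input:

```shell
# Each [n] marker ends a paragraph; NF per record is the paragraph word count.
printf 'one two [2] three four five [3] six\n' | tr '\n' ' ' |
    awk -v RS='\\[[0-9]+\\] ' '{ printf "%s%s", (NR > 1 ? "," : ""), NF } END { print "" }'
# prints: 2,3,1
```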
## add row names, but we generally don't want/need this
# echo "book"{1..35}"," | tr ' ' '\n' | paste -d'\0' - word_counts.csv > word_counts_2.csv
# mv -f word_counts_2.csv word_counts.csv
## Plot it in R
cat << 'EOF' > word_counts.R
wc <- readLines("word_counts.csv")
wc <- strsplit(wc, ",")
for(i in 1:length(wc)) wc[[i]] <- cbind(i, unlist(wc[[i]]))
wc <- as.data.frame(Reduce(rbind, wc), stringsAsFactors = FALSE)
names(wc) <- c("book", "count")
png("word_counts.png", width = 600, height = 600)
boxplot(as.numeric(count)~book, data=wc,
main = "Distributions of paragraph lengths in Livy Books 1-45")
dev.off()
EOF
Rscript --vanilla word_counts.R