Last active
August 29, 2015 14:09
-
-
Save gwerbin/003877ae8abc55ec4057 to your computer and use it in GitHub Desktop.
Data processing in Bash -- some examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
## Process the "Housing" data set from the UCI Machine Learning Data Repository.
## (Fixed shebang: the original "#!/usr/env bash" fails -- env lives in /usr/bin.)
set -euo pipefail

## download the "housing" dataset; this will also tell you how big it is
curl https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data -O

## check if it has carriage returns in line endings
## (grep -l on stdin prints "(standard input)" on a match, nothing otherwise)
if [[ -z $(grep -E -l $'\r$' < housing.data) ]]; then
  echo 'no CR line endings'
else
  echo 'has CR line endings'
fi

## see how long the first row is (i.e. is it safe to print row by row?)
head -n1 housing.data | wc -c
## then see how many rows there are
wc -l housing.data
## then print the first few rows
head housing.data

## it's whitespace-delimited, with no spaces in the fields themselves:
## strip leading/trailing whitespace and convert to comma-delimited with sed
sed -E -e 's/^[[:blank:]]+|[[:blank:]]+$//' -e 's/[[:blank:]]+/,/g' housing.data > housing.csv
## for some reason my version of sed doesn't like shorthand character classes, otherwise it'd be
# sed -E -e 's/^\s+|\s+$//' -e 's/\s+/,/g' housing.data > housing.csv
## or with AWK (which I like better)
# awk -v OFS=',' '{ $1=$1; print }' housing.data > housing.csv

## tabulate the 4th column
cut -d, -f4 housing.csv | sort -n | uniq -c

## copy the column names from the website and turn them into a CSV header line.
## The here-doc delimiter is quoted so literals like "$10,000" are not subject
## to parameter expansion, and the explicit `echo` terminates the header line:
## the original piped a newline-less header into `cat`, which glued the first
## data row onto the header.
{
  awk '{print $2}' << 'EOF' | tr ':' ',' | tr -d '\n' | sed 's/,$//'
1. CRIM: per capita crime rate by town
2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
3. INDUS: proportion of non-retail business acres per town
4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
5. NOX: nitric oxides concentration (parts per 10 million)
6. RM: average number of rooms per dwelling
7. AGE: proportion of owner-occupied units built prior to 1940
8. DIS: weighted distances to five Boston employment centres
9. RAD: index of accessibility to radial highways
10. TAX: full-value property-tax rate per $10,000
11. PTRATIO: pupil-teacher ratio by town
12. B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
13. LSTAT: % lower status of the population
14. MEDV: Median value of owner-occupied homes in $1000's
EOF
  echo
} | cat - housing.csv > housing_header.csv
## replace the headerless file; the original followed this with
## `rm -f housing_header.csv`, a no-op since mv had already moved it
mv -f housing_header.csv housing.csv

## cut the 9th and 14th columns for use in the homework assignment
cut -d, -f9,14 housing.csv > rad_medv.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
## Prepare Livy's 45 books for text processing, and plot the distribution of
## paragraph lengths in each book.
## (Fixed shebang: the original "#!/usr/env bash" fails -- env lives in /usr/bin.)
set -euo pipefail

## Figure out where the text files are stored
curl http://thelatinlibrary.com/liv.html | grep Liber

## Extract the file names and download each one.
## NOTE: the original pattern 'liv.\d+.shtml' used the PCRE-only \d escape,
## which ERE (grep -E / egrep) does not support; use [0-9]+ and escape the dots.
for book in $(curl http://thelatinlibrary.com/liv.html | grep Liber | grep -E -o 'liv\.[0-9]+\.shtml'); do
  curl -O "http://thelatinlibrary.com/livy/$book"
done

## Total size
du -ch -- * | tail -n1

## check the length of the first file and scroll through it
wc -l liv.1.shtml
# less liv.1.shtml

## figure out how to un-HTML it
# brew search html
# brew install html2text
## this is pretty cool! http://www.mbayer.de/html2text/
## (glob instead of parsing `ls`; parameter expansion instead of tr|sed per file)
for file in *.shtml; do
  filebase=${file//./_}        # liv.1.shtml -> liv_1_shtml
  filebase=${filebase%_shtml}  # -> liv_1
  html2text "$file" > "${filebase}.txt"
done
# less liv_1.txt

## found some unicode character in there. I checked on the website and it's a
## badly-encoded em dash. Also the first few lines and the last line need to
## be deleted. (The glob expands before the loop runs, so the ascii_*.txt
## files created here are not re-processed.)
for file in *.txt; do
  iconv -f ISO-8859-1 -t ascii --unicode-subst="<U+%x>" "$file" \
    | sed -e '1,7 d' -e '$ d' > "ascii_$file"
  rm -f -- "$file"
done

## for each file, count the number of words in each paragraph (" [n] " section
## markers delimit paragraphs) and append one CSV row per book.
## The explicit `echo` guarantees each book's row ends with a newline -- the
## original relied on BSD sed appending one to newline-less input, which GNU
## sed does not do (all rows would merge into one line). Also give awk a real
## format string instead of using NF itself as the printf format.
for file in ascii*; do
  {
    tr '\n' ' ' < "$file" \
      | perl -pe 's/ \[\d+\] /\n/g' \
      | awk '{printf "%d,", NF}' \
      | sed 's/,$//'
    echo
  } >> word_counts.csv
done

## add row names, but we generally don't want/need this (45 books, not 35)
# echo "book"{1..45}"," | tr ' ' '\n' | paste -d'\0' - word_counts.csv > word_counts_2.csv
# mv -f word_counts_2.csv word_counts.csv

## Plot it in R (quoted delimiter so the R source passes through verbatim)
cat << 'EOF' > word_counts.R
wc <- readLines("word_counts.csv")
wc <- strsplit(wc, ",")
for(i in 1:length(wc)) wc[[i]] <- cbind(i, unlist(wc[[i]]))
wc <- as.data.frame(Reduce(rbind, wc))
names(wc) <- c("book", "count")
png("word_counts.png", width = 600, height = 600)
boxplot(as.numeric(count)~book, data=wc,
        main = "Distributions of paragraph lengths in Livy Books 1-45")
dev.off()
EOF
Rscript --vanilla word_counts.R
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment