Skip to content

Instantly share code, notes, and snippets.

@maxstarkenburg
Last active June 12, 2016 05:52
Show Gist options
  • Save maxstarkenburg/52fa422a017002076269 to your computer and use it in GitHub Desktop.
Save maxstarkenburg/52fa422a017002076269 to your computer and use it in GitHub Desktop.
A shell/gawk script that runs through the private comment field of my tab-separated LibraryThing export file, looks for monetary values I put there, converts them to dollars when necessary, adds up the total I've spent on books, and outputs a histogram of the amount of books I've bought per dollar amount.
#!/bin/sh
cat $1 |
# Remove the titles on the first line
sed 1d |
gawk '
BEGIN {
FS="\t";
print "MY BOOK SPENDING (approximate values), as of " strftime("%B %e, %Y at %H:%M %z") "\n";
empties=0;
library_total=0;
# This regex matches strings like "NN[.NN][euros]" with the first capture being the number and the second being the currency.
# The currency might be junk text like parens, in which case assume dollars.
quantity_currency="([0-9]+\\.?[0-9]*)([^ ]*)( |$)";
}
{
# The private comments field, where I am storing the purchase price (in English-ish), is 13th field in the export
private_comment=$13;
book_total=0;
# If there is nothing there, I did not have information about the price of that book. Get a total of the "unknowns"
if (private_comment == "") {
empties++;
# If there are matches for a money amounts, get those and total them up per book, then per entire library
} else if (match(private_comment, quantity_currency)) {
while (match(private_comment, quantity_currency, a)) {
quantity=a[1];
currency=a[2];
# Hack: checking for "2" because I have a few instances of "2 vols." or "2nd copy" that are matching, whereas others in dollars are always $2.00
if (quantity != 2) {
# Adjust the amounts for different currencies. These are of course approximate, and roughly calibrated to when I made the purchase
if (currency ~ /^mxn/) {
quantity /= 12;
} else if (currency ~ /^€/) {
quantity /= 0.75;
} else if (currency ~ /^cad/) {
quantity /= 1.05;
} else if (currency ~ /^clp/) {
quantity /= 530;
} else if (currency ~ /^gtq/) {
quantity /= 7.75;
} else if (currency ~ /^hdl/) {
quantity /= 2;
} else if (currency ~ /^gbp/) {
quantity /= 0.48;
}
}
# Maninpulate the string to look for other money amounts further along in the string
private_comment = substr(private_comment, RSTART + RLENGTH);
book_total += quantity;
}
# For books between, say, 1.00 and 1.99, return 1 so that we can group these for the histogram
lower_range = int(book_total);
# Make an exceptional value for books that were free, not just somewhere between 0.00 and 0.99
if (book_total == 0) lower_range = -1;
# Sum up the number of books bought at each dollar range, for histogram
counts[lower_range] = counts[lower_range] + 1;
# Add up the price of the current book to the total for the whole library
library_total += book_total;
# If there was some other problem (e.g. all text, no numbers)
} else {
print "DEBUG: There was a problem calculating the price for this book. Field value: " private_comment " \n";
}
}
function histogram_bars(total) {
if (total == "") total = 0;
for (c=1; c<=total; c+=5) {
printf "■";
}
if (total >=1) printf " %s", total;
printf " \n";
}
END {
long_tail = 40;
printf("Library volumes = %'\''d books\n", NR);
printf("Library total known cost = $%'\''.2f\n", library_total);
average_known = library_total / (NR - empties);
free_books = counts[-1];
average_purchased = library_total / (NR - empties - free_books);
printf("Average price for all books = $%.2f (For all books with a known price, including free ones)\n", average_known);
printf("Average purchased price = $%.2f (Excluding free books)\n", average_purchased);
printf("Library total probable cost¹ = $%'\''.2f\n\n", library_total + average_purchased * empties);
# Histogram stuff partially based on http://kuscsik.blogspot.cl/2008/02/how-to-create-histogram-using-awk.html
print ("Number of books bought in each dollar range:\n")
# a line for the ones without a known price
printf(" unknown: ");
histogram_bars(empties);
printf("\n");
# a line for each of dollar increment, up to a certain amount (with some special stuff for the free books)
for (i=-1; i<long_tail; i++) {
if (i==-1) {
printf(" free: ",i+1);
} else {
printf("$%05.2f-$%02d: ",i + .01,i+1);
}
histogram_bars(counts[i]);
}
# If they were more than a certain amount, say $40, group those together
long_tail_total=0;
for (count in counts) {
if (count >= long_tail) {
long_tail_total++
}
}
printf(" $%s+: ",long_tail);
histogram_bars(long_tail_total);
print "\n¹ Extrapolates unknown costs based on the average known purchase price";
}
'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment