A shell/gawk script that runs through the private comment field of my tab-separated LibraryThing export file, looks for monetary values I put there, converts them to dollars when necessary, adds up the total I've spent on books, and outputs a histogram of the amount of books I've bought per dollar amount.
#!/bin/sh | |
cat $1 | | |
# Remove the titles on the first line | |
sed 1d | | |
gawk ' | |
BEGIN { | |
FS="\t"; | |
print "MY BOOK SPENDING (approximate values), as of " strftime("%B %e, %Y at %H:%M %z") "\n"; | |
empties=0; | |
library_total=0; | |
# This regex matches strings like "NN[.NN][euros]" with the first capture being the number and the second being the currency. | |
# The currency might be junk text like parens, in which case assume dollars. | |
quantity_currency="([0-9]+\\.?[0-9]*)([^ ]*)( |$)"; | |
} | |
{ | |
# The private comments field, where I am storing the purchase price (in English-ish), is 13th field in the export | |
private_comment=$13; | |
book_total=0; | |
# If there is nothing there, I did not have information about the price of that book. Get a total of the "unknowns" | |
if (private_comment == "") { | |
empties++; | |
# If there are matches for a money amounts, get those and total them up per book, then per entire library | |
} else if (match(private_comment, quantity_currency)) { | |
while (match(private_comment, quantity_currency, a)) { | |
quantity=a[1]; | |
currency=a[2]; | |
# Hack: checking for "2" because I have a few instances of "2 vols." or "2nd copy" that are matching, whereas others in dollars are always $2.00 | |
if (quantity != 2) { | |
# Adjust the amounts for different currencies. These are of course approximate, and roughly calibrated to when I made the purchase | |
if (currency ~ /^mxn/) { | |
quantity /= 12; | |
} else if (currency ~ /^€/) { | |
quantity /= 0.75; | |
} else if (currency ~ /^cad/) { | |
quantity /= 1.05; | |
} else if (currency ~ /^clp/) { | |
quantity /= 530; | |
} else if (currency ~ /^gtq/) { | |
quantity /= 7.75; | |
} else if (currency ~ /^hdl/) { | |
quantity /= 2; | |
} else if (currency ~ /^gbp/) { | |
quantity /= 0.48; | |
} | |
} | |
# Maninpulate the string to look for other money amounts further along in the string | |
private_comment = substr(private_comment, RSTART + RLENGTH); | |
book_total += quantity; | |
} | |
# For books between, say, 1.00 and 1.99, return 1 so that we can group these for the histogram | |
lower_range = int(book_total); | |
# Make an exceptional value for books that were free, not just somewhere between 0.00 and 0.99 | |
if (book_total == 0) lower_range = -1; | |
# Sum up the number of books bought at each dollar range, for histogram | |
counts[lower_range] = counts[lower_range] + 1; | |
# Add up the price of the current book to the total for the whole library | |
library_total += book_total; | |
# If there was some other problem (e.g. all text, no numbers) | |
} else { | |
print "DEBUG: There was a problem calculating the price for this book. Field value: " private_comment " \n"; | |
} | |
} | |
function histogram_bars(total) { | |
if (total == "") total = 0; | |
for (c=1; c<=total; c+=5) { | |
printf "■"; | |
} | |
if (total >=1) printf " %s", total; | |
printf " \n"; | |
} | |
END { | |
long_tail = 40; | |
printf("Library volumes = %'\''d books\n", NR); | |
printf("Library total known cost = $%'\''.2f\n", library_total); | |
average_known = library_total / (NR - empties); | |
free_books = counts[-1]; | |
average_purchased = library_total / (NR - empties - free_books); | |
printf("Average price for all books = $%.2f (For all books with a known price, including free ones)\n", average_known); | |
printf("Average purchased price = $%.2f (Excluding free books)\n", average_purchased); | |
printf("Library total probable cost¹ = $%'\''.2f\n\n", library_total + average_purchased * empties); | |
# Histogram stuff partially based on http://kuscsik.blogspot.cl/2008/02/how-to-create-histogram-using-awk.html | |
print ("Number of books bought in each dollar range:\n") | |
# a line for the ones without a known price | |
printf(" unknown: "); | |
histogram_bars(empties); | |
printf("\n"); | |
# a line for each of dollar increment, up to a certain amount (with some special stuff for the free books) | |
for (i=-1; i<long_tail; i++) { | |
if (i==-1) { | |
printf(" free: ",i+1); | |
} else { | |
printf("$%05.2f-$%02d: ",i + .01,i+1); | |
} | |
histogram_bars(counts[i]); | |
} | |
# If they were more than a certain amount, say $40, group those together | |
long_tail_total=0; | |
for (count in counts) { | |
if (count >= long_tail) { | |
long_tail_total++ | |
} | |
} | |
printf(" $%s+: ",long_tail); | |
histogram_bars(long_tail_total); | |
print "\n¹ Extrapolates unknown costs based on the average known purchase price"; | |
} | |
' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment