A shell/gawk script that runs through the private comment field of my tab-separated LibraryThing export file, looks for monetary values I put there, converts them to dollars when necessary, adds up the total I've spent on books, and outputs a histogram of the amount of books I've bought per dollar amount.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
cat $1 | | |
# Remove the titles on the first line | |
sed 1d | | |
gawk ' | |
BEGIN { | |
FS="\t"; | |
print "MY BOOK SPENDING (approximate values), as of " strftime("%B %e, %Y at %H:%M %z") "\n"; | |
empties=0; | |
library_total=0; | |
# This regex matches strings like "NN[.NN][euros]" with the first capture being the number and the second being the currency. | |
# The currency might be junk text like parens, in which case assume dollars. | |
quantity_currency="([0-9]+\\.?[0-9]*)([^ ]*)( |$)"; | |
} | |
{ | |
# The private comments field, where I am storing the purchase price (in English-ish), is 13th field in the export | |
private_comment=$13; | |
book_total=0; | |
# If there is nothing there, I did not have information about the price of that book. Get a total of the "unknowns" | |
if (private_comment == "") { | |
empties++; | |
# If there are matches for a money amounts, get those and total them up per book, then per entire library | |
} else if (match(private_comment, quantity_currency)) { | |
while (match(private_comment, quantity_currency, a)) { | |
quantity=a[1]; | |
currency=a[2]; | |
# Hack: checking for "2" because I have a few instances of "2 vols." or "2nd copy" that are matching, whereas others in dollars are always $2.00 | |
if (quantity != 2) { | |
# Adjust the amounts for different currencies. These are of course approximate, and roughly calibrated to when I made the purchase | |
if (currency ~ /^mxn/) { | |
quantity /= 12; | |
} else if (currency ~ /^€/) { | |
quantity /= 0.75; | |
} else if (currency ~ /^cad/) { | |
quantity /= 1.05; | |
} else if (currency ~ /^clp/) { | |
quantity /= 530; | |
} else if (currency ~ /^gtq/) { | |
quantity /= 7.75; | |
} else if (currency ~ /^hdl/) { | |
quantity /= 2; | |
} else if (currency ~ /^gbp/) { | |
quantity /= 0.48; | |
} | |
} | |
# Maninpulate the string to look for other money amounts further along in the string | |
private_comment = substr(private_comment, RSTART + RLENGTH); | |
book_total += quantity; | |
} | |
# For books between, say, 1.00 and 1.99, return 1 so that we can group these for the histogram | |
lower_range = int(book_total); | |
# Make an exceptional value for books that were free, not just somewhere between 0.00 and 0.99 | |
if (book_total == 0) lower_range = -1; | |
# Sum up the number of books bought at each dollar range, for histogram | |
counts[lower_range] = counts[lower_range] + 1; | |
# Add up the price of the current book to the total for the whole library | |
library_total += book_total; | |
# If there was some other problem (e.g. all text, no numbers) | |
} else { | |
print "DEBUG: There was a problem calculating the price for this book. Field value: " private_comment " \n"; | |
} | |
} | |
function histogram_bars(total) { | |
if (total == "") total = 0; | |
for (c=1; c<=total; c+=5) { | |
printf "■"; | |
} | |
if (total >=1) printf " %s", total; | |
printf " \n"; | |
} | |
END { | |
long_tail = 40; | |
printf("Library volumes = %'\''d books\n", NR); | |
printf("Library total known cost = $%'\''.2f\n", library_total); | |
average_known = library_total / (NR - empties); | |
free_books = counts[-1]; | |
average_purchased = library_total / (NR - empties - free_books); | |
printf("Average price for all books = $%.2f (For all books with a known price, including free ones)\n", average_known); | |
printf("Average purchased price = $%.2f (Excluding free books)\n", average_purchased); | |
printf("Library total probable cost¹ = $%'\''.2f\n\n", library_total + average_purchased * empties); | |
# Histogram stuff partially based on http://kuscsik.blogspot.cl/2008/02/how-to-create-histogram-using-awk.html | |
print ("Number of books bought in each dollar range:\n") | |
# a line for the ones without a known price | |
printf(" unknown: "); | |
histogram_bars(empties); | |
printf("\n"); | |
# a line for each of dollar increment, up to a certain amount (with some special stuff for the free books) | |
for (i=-1; i<long_tail; i++) { | |
if (i==-1) { | |
printf(" free: ",i+1); | |
} else { | |
printf("$%05.2f-$%02d: ",i + .01,i+1); | |
} | |
histogram_bars(counts[i]); | |
} | |
# If they were more than a certain amount, say $40, group those together | |
long_tail_total=0; | |
for (count in counts) { | |
if (count >= long_tail) { | |
long_tail_total++ | |
} | |
} | |
printf(" $%s+: ",long_tail); | |
histogram_bars(long_tail_total); | |
print "\n¹ Extrapolates unknown costs based on the average known purchase price"; | |
} | |
' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment