Created
December 14, 2012 15:58
-
-
Save yanofsky/4286461 to your computer and use it in GitHub Desktop.
How I calculated the amount of printing, and the number of lines, on every Form 1040 ever issued
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
from PIL import Image | |
import json | |
def calculate_black_stats():
    """Tally the dark-pixel count of every page image under ``png/``.

    Each ``png/*.png`` path is expected to look like
    ``<anything>__<year>.pdf-<page>.png``; the year and the page index are
    parsed out of the path itself.  For every year the function records:

    * ``pages`` -- one pixel count per page index (13 fixed slots),
    * ``size``  -- the summed ``width * height`` of that year's images,
    * ``total`` -- the sum of the per-page counts.

    The results are printed as they are computed and then written to
    ``outdataCrop.json`` and ``outdataCrop.csv`` in the current directory.
    The CSV additionally carries a ``ratio`` column (``total / size``).

    Raises:
        IndexError: if a path does not match the expected naming scheme, or
            a page index does not fit in the 13 available slots.
        ValueError: if the page part of a path is not an integer.
    """
    # Number of per-page slots kept for every year.  The CSV header is
    # derived from this so that it always matches the row width.
    page_slots = 13

    data = {}
    for path in glob.glob("png/*.png"):
        year = path.split(".pdf")[0].split("__")[1]
        page = int(path.split("-")[1].split(".png")[0])
        if year not in data:
            data[year] = {"pages": [0] * page_slots, "total": 0, "size": 0}

        image = Image.open(path)
        # NOTE(review): this takes the count of the FIRST entry returned by
        # amount_of_black().  That is only the pure-black count if the first
        # entry is pixel value 0 -- confirm against the input images.
        data[year]["pages"][page] = amount_of_black(image)[0][0]
        width, height = image.size
        data[year]["size"] += width * height
        print("%s %s %s" % (year, page, data[year]["pages"][page]))

    for stats in data.values():
        stats["total"] = sum(stats["pages"])

    print("*******************\n\n\n\n")

    # Sorted so that the summary and the CSV rows come out in a stable order.
    years = sorted(data)
    for year in years:
        print("%s %s" % (year, data[year]["total"]))

    with open("outdataCrop.json", "w") as out:
        out.write(json.dumps(data, indent=4))

    header = ["year", "size", "total", "ratio"]
    header += [str(slot) for slot in range(page_slots)]
    with open("outdataCrop.csv", "w") as out:
        out.write(",".join(header) + "\n")
        for year in years:
            stats = data[year]
            # float() forces true division; plain "/" on two ints truncates
            # to 0 under Python 2.
            if stats["size"]:
                ratio = float(stats["total"]) / stats["size"]
            else:
                ratio = 0.0
            row = [year, stats["size"], stats["total"], ratio] + stats["pages"]
            out.write(",".join(str(cell) for cell in row) + "\n")
def amount_of_black(image):
    """Return the colour tally of ``image`` after grayscale conversion.

    The image is converted to mode ``"L"`` and the result of calling
    ``getcolors()`` on the converted image is returned unchanged; per
    Pillow's documentation that is a list of ``(count, pixel)`` tuples.
    The argument itself is not modified.
    """
    return image.convert("L").getcolors()
# Script entry point: runs the whole analysis as soon as the file is executed.
calculate_black_stats()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
from PIL import Image | |
import json | |
def interactive_line_count():
    """Show every page image and ask the operator how many lines it has.

    Each ``png/*.png`` path is expected to look like
    ``<anything>__<year>.pdf-<page>.png``; only the year is used here.  The
    images are opened with ``Image.show()`` one at a time and the running
    per-year count is updated from the operator's answer (see
    ``ask_for_data``).  After the last image the totals are written to
    ``numlines.json`` and ``numlines.csv`` in the current directory.

    Raises:
        IndexError: if a path does not contain the ``__<year>`` marker.
    """
    data = {}
    # Sorted so the images are presented in a reproducible order.
    for path in sorted(glob.glob("png/*.png")):
        year = path.split(".pdf")[0].split("__")[1]
        image = Image.open(path)
        image.show()
        data[year] = ask_for_data(data.get(year, 0))
        print("%s now at: %s" % (year, data[year]))

    with open("numlines.json", "w") as out:
        out.write(json.dumps(data, indent=4))

    with open("numlines.csv", "w") as out:
        out.write("year,number of lines\n")
        for year in sorted(data):
            out.write("%s,%s\n" % (year, data[year]))
def ask_for_data(val, read_input=None):
    """Prompt until a valid command is entered and return the updated count.

    Accepted commands (the command letter must come first):

    * ``r<N>`` -- replace the running count with ``N``,
    * ``a<N>`` -- add ``N`` to the running count,
    * ``s``    -- skip, leaving the running count unchanged.

    Anything else, including a malformed number, prints
    ``ERROR, try again`` and asks again instead of raising, so one typo does
    not abort the whole session.

    Args:
        val: the running count before this prompt.
        read_input: optional callable taking the prompt string and returning
            the answer.  Defaults to ``raw_input`` on Python 2 and ``input``
            on Python 3.

    Returns:
        The new running count.
    """
    if read_input is None:
        try:
            read_input = raw_input  # Python 2
        except NameError:
            read_input = input  # Python 3

    while True:
        answer = read_input("How Many Lines? ").strip()
        command, argument = answer[:1], answer[1:]

        if command == "s":
            return val

        if command in ("r", "a"):
            try:
                amount = int(argument)
            except ValueError:
                amount = None  # not a number: fall through to the retry
            if amount is not None:
                return amount if command == "r" else val + amount

        print("ERROR, try again")
# Script entry point: starts the interactive session as soon as the file is executed.
interactive_line_count()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment