Created
June 6, 2013 19:52
-
-
Save lukerosiak/5724394 to your computer and use it in GitHub Desktop.
Download text files representing OCR'd images of IRS Form 990s, corresponding to the URL scheme at bulk.resource.org/irs.gov/eo. Metadata for each file, such as name of nonprofit, IRS EIN, and year, is available in the "manifest" files there. A parser for those is available at github.com/lukerosiak/irs/.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Mirror the entire nonprofittext S3 bucket into a local directory.

A file is downloaded when it is not already present locally, or when the S3
version is larger than the local copy by more than SIZE_SLACK_BYTES.

The only dependency is boto. To install: pip install boto
To run: python download.py
Set DEST_PATH to the directory you want the files in.
"""
import os

import boto

#DEST_PATH = '/media/sf_bulknobackup/nonprofittext'
DEST_PATH = '/irs.gov/eo/raw'

# A local file is replaced only when the S3 object is more than this many
# bytes larger than it.
# NOTE(review): the reason for the 10-byte tolerance is not documented --
# confirm before changing it.
SIZE_SLACK_BYTES = 10

# Print a progress line each time the processed-file count hits a multiple
# of this value.
PROGRESS_EVERY = 2000


def _ensure_dir(path):
    """Create directory *path* (and any missing parents) if it is absent.

    Raises OSError when the directory cannot be created for any reason
    other than it already existing (e.g. permissions, or a regular file
    occupying the path).
    """
    try:
        os.makedirs(path)
    except OSError:
        # makedirs raises if the path already exists; that is only fine
        # when what exists is actually a directory.
        if not os.path.isdir(path):
            raise


_ensure_dir(DEST_PATH)
# Key names are used as relative paths below, so work from inside DEST_PATH.
os.chdir(DEST_PATH)

conn_s3 = boto.connect_s3(anon=True)
bucket = conn_s3.get_bucket('nonprofittext')

status = {'new': 0, 'updated': 0, 'skipped-exists': 0}

for k in bucket:
    name = k.name
    if not os.path.exists(name):
        # Create every intermediate directory of the key, not just the
        # first component; keys without a '/' need no directory at all.
        parent = os.path.dirname(name)
        if parent:
            _ensure_dir(parent)
        k.get_contents_to_filename(name)
        status['new'] += 1
    elif k.size > os.path.getsize(name) + SIZE_SLACK_BYTES:
        k.get_contents_to_filename(name)
        print('replacing %s' % name)
        status['updated'] += 1
    else:
        status['skipped-exists'] += 1

    processed = status['new'] + status['updated'] + status['skipped-exists']
    if processed % PROGRESS_EVERY == 0:
        print('working... processed %s files... %s new, %s updated, %s no change.' %
              (processed, status['new'], status['updated'], status['skipped-exists']))

print('mirror complete.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment