Skip to content

Instantly share code, notes, and snippets.

@greglinch
Last active January 29, 2017 17:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save greglinch/8adcf412448775e01b69 to your computer and use it in GitHub Desktop.
Save greglinch/8adcf412448775e01b69 to your computer and use it in GitHub Desktop.
Upload PDFs from the URLs listed in a csv file to DocumentCloud.org using Ben Welsh's python-documentcloud API wrapper: https://python-documentcloud.readthedocs.io/en/latest/gettingstarted.html#uploading-a-pdf-from-a-url
from documentcloud import DocumentCloud
import urllib, cStringIO, csv
## Create the DocumentCloud.org client; upload_doc() sends every document through it.
## "USERNAME" and "PASSWORD" look like placeholders -- replace them with real
## account credentials before running, and avoid committing real ones.
client = DocumentCloud("USERNAME", "PASSWORD")
## Rename table for the extra data stored with each document:
##   key   = column name as it appears in the csv
##   value = name the same value is stored under on DocumentCloud
## This could be abstracted by keeping these pairs in a separate csv and passing
## both the data csv and the mapping csv as command line arguments.
field_mapping = {
    "wpid": "wpid",
    "name": "org_name",
    "city": "org_city",
    "state": "org_state",
    "year": "org_year",
    "revenue": "org_revenue",
    "org_type": "org_type",
    "desc": "org_description",
    "docurl": "source_url",
    "ein": "org_ein",
}
def upload_doc(data_dict):
    """
    Rename one csv row's fields for DocumentCloud, download the file at its
    source url and upload it.

    data_dict: one csv row as a dict keyed by csv column name. After renaming
        through field_mapping it must provide "source_url", "org_name" and
        "org_year"; a KeyError is raised if one of them is missing.

    Columns without an entry in field_mapping are skipped instead of raising
    KeyError, so the csv may carry extra columns.

    Returns None and prints the title of the uploaded document.
    """
    ## imported locally so this function does not rely on a change to the
    ## import line at the top of the file
    from contextlib import closing

    ## rename each known csv key to its DocumentCloud key; values are unchanged
    clean_data_kwargs = {}
    for key, value in data_dict.items():
        if key in field_mapping:
            clean_data_kwargs[field_mapping[key]] = value

    ## Download the URL with urllib; closing() releases the connection even if
    ## the read fails
    url = clean_data_kwargs["source_url"]
    with closing(urllib.urlopen(url)) as response:
        file_contents = response.read()

    ## Stuff it in a file object with cStringIO
    file_obj = cStringIO.StringIO(file_contents)

    ## Set kwargs for documentcloud.org
    kwargs = {
        "title": clean_data_kwargs["org_name"] + " - " + clean_data_kwargs["org_year"],  # update as needed
        "source": "SOURCE",
        "description": "DESC",
        "access": "ACCESS",
        "project": "PROJ",
        "data": clean_data_kwargs,  # optional
        "secure": False,  # or True if you don't want to send docs to OpenCalais
    }

    ## Upload that to DocumentCloud
    client.documents.upload(file_obj, **kwargs)

    ## a single parenthesised value prints the same under the Python 2 print
    ## statement as the bare form did
    print("Uploaded: %s" % (kwargs["title"]))
    print("\n")
## name of the csv that lists every document: its url plus its metadata
filename = "FILENAME.csv"

## read the csv in binary mode and hand each row, as a dict keyed by the
## header names, to upload_doc
with open(filename, 'rb') as csv_file:
    for record in csv.DictReader(csv_file):
        upload_doc(record)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment