# GitHub gist Andrew62/19e7fb75685294eee79e by @Andrew62, created February 14, 2015 20:48
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 14 14:46:32 2015
@author: andrew
Data sources
code workshop 2
Script will:
write a script that will download the last 5
DOJ press releases using their REST API
Print the title of each doc and save the json for each
individual press release
www.justice.gov/developer/api-documentation/api_v1
IF YOU FINISH EARLY:
OpenCalais bit.ly/1vsO33f
Will do entity tagging via a rest API (NLP)
Need to sign up for a free account
Allen's answer bit.ly/1KVvH2a
"""
import json
import re
import unicodedata

import requests
#DOJ rest api
#url = "http://www.justice.gov/api/v1/press_releases.json"
def slugify(value):
    """
    Turn a text string into a lowercase, hyphen-separated ASCII slug.

    Accented characters are decomposed first so their base letter survives
    the ASCII conversion ("Café" -> "cafe"); characters with no ASCII
    equivalent are dropped. Everything that is not alphanumeric, an
    underscore, whitespace or a hyphen is removed, leading and trailing
    whitespace is stripped, and each run of whitespace and/or hyphens
    collapses into a single hyphen.

    Returns the slug, which is an empty string when nothing usable remains.
    Note: This is not production code
    """
    # NFKD splits "é" into "e" + combining accent; the ASCII encode below
    # then discards only the accent instead of the whole letter.
    value = unicodedata.normalize('NFKD', value)
    value = value.encode('ascii', 'ignore').decode('ascii')
    # Raw strings keep \w and \s as regex escapes rather than (invalid)
    # string escapes.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
def get_releases(url, timeout=30):
    """
    Gets the resources from the url and returns the decoded "results".

    url: endpoint to request; its JSON body must carry a top-level
        "results" key.
    timeout: seconds to wait for the server before requests gives up.

    Returns the value stored under "results" in the JSON response, or an
    empty list when the server answers with an error status, so callers
    can always iterate over the result.
    Raises KeyError if the response JSON has no "results" key; connection
    and timeout errors raised by requests are not caught here.
    """
    # Extra headers can be supplied with requests.get(url, headers={...}).
    # Without an explicit timeout a stalled server blocks the script forever.
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        return []
    formatted = response.json()
    return formatted["results"]
def write_json_file(item):
    """
    Writes json object to a json file in the current working directory.

    item: JSON-serializable dict with a "title" key; the slugified title
        becomes the file name ("<slug>.json").

    An existing file with the same name is overwritten.
    Raises KeyError if item has no "title" key.
    """
    # A title made up only of punctuation or non-ASCII characters slugifies
    # to "", which would produce a hidden file named ".json"; fall back to
    # a visible placeholder name instead.
    title = slugify(item["title"]) or "untitled"
    with open("{0}.json".format(title), "w") as target:
        # json.dump escapes non-ASCII characters by default, so the output
        # is plain ASCII whatever the platform's default file encoding is.
        json.dump(item, target, indent=4, sort_keys=True)
def use_open_calais():
    """
    Placeholder for OpenCalais entity tagging; not implemented yet.

    Site url:
    http://www.opencalais.com/documentation/calais-web-service-api/forming-api-calls
    """
    # Check your downloads folder for the python wrapper.
    pass
def main():
    """
    Requests a page of five DOJ press releases, prints the title of each
    and saves each release to its own json file.
    """
    url = "http://www.justice.gov/api/v1/press_releases.json?pagesize=5"
    # A failed request can come back with nothing to iterate over; treat
    # that as "no releases" instead of crashing in the loop.
    releases = get_releases(url) or []
    for release in releases:
        # Function-call form of print works on both Python 2 and Python 3.
        print(release["title"])
        write_json_file(release)
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# End of gist. (GitHub page footer: sign in on GitHub to comment on this gist.)