Created
February 14, 2015 20:48
-
-
Save Andrew62/19e7fb75685294eee79e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Sat Feb 14 14:46:32 2015 | |
@author: andrew | |
Data sources | |
code workshop 2 | |
This script will:
download the last 5 DOJ press releases using their REST API,
print the title of each doc, and save the JSON for each
individual press release
www.justice.gov/developer/api-documentation/api_v1 | |
IF YOU FINISH EARLY: | |
OpenCalais bit.ly/1vsO33f | |
Will do entity tagging via a rest API (NLP) | |
Need to sign up for a free account | |
Allen's answer bit.ly/1KVvH2a | |
""" | |
import requests | |
import json | |
import re | |
#DOJ rest api | |
#url = "http://www.justice.gov/api/v1/press_releases.json" | |
def slugify(value):
    """
    Turn text into a lowercase, hyphen-separated ASCII slug.

    Steps, in order:
      1. Drop every character that cannot be encoded as ASCII.
      2. Remove anything that is not a word character (letters, digits,
         underscore), whitespace, or a hyphen.
      3. Strip leading and trailing whitespace and convert to lowercase.
      4. Collapse each run of whitespace and/or hyphens into one hyphen.

    Hyphens at the very start or end of the cleaned text are kept.

    Note: This is not production code
    """
    # Non-ASCII characters are discarded here, not transliterated.
    ascii_text = value.encode('ascii', 'ignore').decode('ascii')
    # Raw strings keep \w and \s as regex escapes instead of (invalid)
    # string-literal escapes, which newer interpreters warn about.
    cleaned = re.sub(r'[^\w\s-]', '', ascii_text).strip().lower()
    return re.sub(r'[-\s]+', '-', cleaned)
def get_releases(url):
    """
    Gets the resource at the url and returns the value stored under
    "results" in its JSON body.

    Returns an empty list when the response status is not OK, so the
    caller can always iterate over the result. Errors raised by requests
    itself (connection problems, timeout) and a missing "results" key are
    not caught here.
    """
    # headers = {} <-- contains headers
    # can pass headers into requests.get(url, headers=headers)
    # Without an explicit timeout requests may wait indefinitely on a
    # server that stops responding.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # Previously this path fell through and returned None, which made
        # the caller's for-loop fail with a TypeError.
        return []
    formatted = response.json()
    return formatted["results"]
def write_json_file(item):
    """
    Serialises ``item`` as JSON (4-space indent, keys sorted) into a file
    named after the slugified value of item["title"], with a ".json"
    extension, relative to the current working directory.
    """
    filename = "{0}.json".format(slugify(item["title"]))
    with open(filename, "w") as handle:
        handle.write(json.dumps(item, indent=4, sort_keys=True))
def use_open_calais():
    """
    Placeholder for the OpenCalais step; currently does nothing.

    Site url: http://www.opencalais.com/documentation/calais-web-service-api/forming-api-calls
    Check your downloads folder for the python wrapper.
    """
    return None
def main():
    """
    Requests five press releases from the DOJ API, prints each title and
    writes each release to its own JSON file.
    """
    url = "http://www.justice.gov/api/v1/press_releases.json?pagesize=5"
    # Guard against a falsy result (e.g. nothing returned for a failed
    # request) so the loop below never iterates over None.
    releases = get_releases(url) or []
    for release in releases:
        # Single-argument parenthesised print gives the same output on
        # Python 2 and is valid syntax on Python 3.
        print(release["title"])
        write_json_file(release)


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment