Skip to content

Instantly share code, notes, and snippets.

@cbuntain
Last active July 11, 2022 18:12
Show Gist options
  • Save cbuntain/27109e525ca867d966350fbc63f8bcfc to your computer and use it in GitHub Desktop.
CrisisFACTS Direct Download
import json
import requests
import pandas as pd
### Necessary Credentials
# Registration details the CrisisFACTS server requires before it will
# grant an access key for the data stream.
credentials = dict(
    institution="University of Maryland, College Park",  # University, Company or Public Agency Name
    contactname="Cody Buntain",                          # Your Name
    email="cbuntain@umd.edu",                            # A contact email address
    institutiontype="Research",                          # Either 'Research', 'Industry', or 'Public Sector'
)

### What event and date do you want?
# One event+day pair per request; the server streams that day's documents.
data = dict(
    eventID="007",
    requestDate="2020-08-27",
)
# Code below populates all_content with this event+day pair
all_content = []
url_base = "http://demos.terrier.org/crisisfacts/"

## Register with the Terrier server
# The server expects the event/date fields and the registration info
# merged into a single JSON body.
data.update(credentials)
# Timeout added so a stalled server can't hang the script indefinitely
# (the stream GET below already uses the same 60s timeout).
resp = requests.post(url_base + "register", json=data, timeout=60)
# Fail fast with a clear HTTP error if registration was rejected, instead
# of surfacing later as a cryptic KeyError on "accessKey".
resp.raise_for_status()
resp_dict = resp.json()
## Using the access key, consume data from the stream
### Continue until the server returns an empty body (stream exhausted)
### or a non-200 status code.
run_count = 0
while resp.status_code == 200:
    print("Run:", run_count)

    # Stream data down
    ## Note that we need a high timeout here (>30 seconds seems to work for me)
    resp = requests.get(
        url_base + "stream",
        params={"accessKey": resp_dict["accessKey"]},
        timeout=60,
    )

    # Go from bytes to a string, since we're expecting JSON data.
    # Use a dedicated name instead of reusing (shadowing) the `data`
    # request-payload dict defined above.
    chunk = resp.content.decode("utf8")

    # If no data, we've finished
    if len(chunk) == 0:
        print("Datastream exhausted")
        break

    # Convert from string to JSON (a list of documents) and extend the
    # all_content array with this new data
    all_content.extend(json.loads(chunk))
    run_count += 1
# Convert documents to Pandas DataFrame
df = pd.DataFrame(all_content)

# Summarize how many documents came from each source. The original left
# this expression's result unused (a notebook idiom that does nothing in a
# script), and it raised KeyError when no documents were downloaded.
if "sourceType" in df.columns:
    print(df["sourceType"].value_counts())
else:
    # Empty all_content yields a DataFrame with no columns at all
    print("No documents downloaded; nothing to summarize")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment