Skip to content

Instantly share code, notes, and snippets.

@cbuntain
Last active July 11, 2022 18:12
Show Gist options
  • Save cbuntain/27109e525ca867d966350fbc63f8bcfc to your computer and use it in GitHub Desktop.
CrisisFACTS Direct Download
import json
import requests
import pandas as pd
### Necessary Credentials
# Registration details the CrisisFACTS server requires before it will
# grant an access key for the data stream.
credentials = dict(
    institution="University of Maryland, College Park",  # University, Company or Public Agency Name
    contactname="Cody Buntain",                          # Your Name
    email="cbuntain@umd.edu",                            # A contact email address
    institutiontype="Research",                          # Either 'Research', 'Industry', or 'Public Sector'
)

### What event and date do you want?
# One event+day pair per request; the server streams that day's documents.
data = dict(
    eventID="007",
    requestDate="2020-08-27",
)
# Code below populates all_content with this event+day pair
all_content = []
url_base = "http://demos.terrier.org/crisisfacts/"

## Register with the Terrier server
# The server expects the event/date fields and the registration info
# merged into a single JSON body.
data.update(credentials)
# Timeout added so a stalled server can't hang the script indefinitely
# (the stream GET below already uses the same 60s timeout).
resp = requests.post(url_base + "register", json=data, timeout=60)
# Fail fast with a clear HTTP error if registration was rejected, instead
# of surfacing later as a cryptic KeyError on "accessKey".
resp.raise_for_status()
resp_dict = resp.json()
## Using the access key, consume data from the stream
### Continue until the server returns an empty body (stream exhausted)
### or a non-200 status code.
run_count = 0
while resp.status_code == 200:
    print("Run:", run_count)

    # Stream data down
    ## Note that we need a high timeout here (>30 seconds seems to work for me)
    resp = requests.get(
        url_base + "stream",
        params={"accessKey": resp_dict["accessKey"]},
        timeout=60,
    )

    # Go from bytes to a string, since we're expecting JSON data.
    # Use a dedicated name instead of reusing (shadowing) the `data`
    # request-payload dict defined above.
    chunk = resp.content.decode("utf8")

    # If no data, we've finished
    if len(chunk) == 0:
        print("Datastream exhausted")
        break

    # Convert from string to JSON (a list of documents) and extend the
    # all_content array with this new data
    all_content.extend(json.loads(chunk))
    run_count += 1
# Convert documents to Pandas DataFrame
df = pd.DataFrame(all_content)

# Summarize how many documents came from each source. The original left
# this expression's result unused (a notebook idiom that does nothing in a
# script), and it raised KeyError when no documents were downloaded.
if "sourceType" in df.columns:
    print(df["sourceType"].value_counts())
else:
    # Empty all_content yields a DataFrame with no columns at all
    print("No documents downloaded; nothing to summarize")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment