Created
January 12, 2018 17:57
-
-
Save conorsch/c0c3ee0dfef46444c8c7216054ac7353 to your computer and use it in GitHub Desktop.
First pass at retrieving historical data from Secure The News API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""First pass at retrieving historical data from the Secure The News API.

Fetches the list of tracked news sites, then pulls a subsampled set of
historical scan results for each site and renders two matplotlib charts
of numeric score over time.
"""
import pandas
import requests
import matplotlib.pyplot as plt

# URL for SecureTheNews API (list of all tracked sites).
url = 'https://securethe.news/api/v1/sites/?format=json'

# URL template for retrieving historical scan data for a specific domain.
# Querying historical data for all sites involves a two-step API process.
site_url = 'https://securethe.news/api/v1/sites/{}/scans/'

# Scan data is dense! We have roughly three scans per day per domain.
# That's dense enough, with little enough change, that it clutters graphs.
# Let's instead skip through the historical scans and take a subset,
# e.g. every 500th data point gets graphed.
scan_data_density = 500

# Seconds to wait on each API call, so a stalled connection does not
# hang the script forever.
request_timeout = 30

# Retrieve JSON blob from API. Fail loudly on HTTP errors instead of
# attempting to parse an error page as JSON.
all_sites_request = requests.get(url, timeout=request_timeout)
all_sites_request.raise_for_status()
all_sites = all_sites_request.json()['results']

# Shove JSON into dataframe (not using for now)
# NOTE(review): pandas.io.json.json_normalize is deprecated as of
# pandas 1.0 in favor of pandas.json_normalize — switch once the
# installed pandas version is confirmed to support it.
df = pandas.io.json.json_normalize(all_sites)
# print(df)

# Cheat and use simple data structures for munging data
scores = []
timestamps = []

# Loop through all sites referenced by the API.
# Extract the domain mentioned, then make a second API call
# to get historical data per domain (perhaps API v2 should
# consolidate the two-step process, to make the API more straightforward).
total_num_sites = len(all_sites)
# Count from 1 so the progress display ends at "N/N" rather than
# starting at "0/N" and stopping at "N-1/N".
for i, site in enumerate(all_sites, start=1):
    # Extract target domain from list of domains.
    domain = site['domain']
    print("Evaluating {} ({}/{})".format(domain, i, total_num_sites))
    scans_url = site_url.format(domain)
    site_request = requests.get(scans_url, timeout=request_timeout)
    site_request.raise_for_status()
    site_scan_results = site_request.json()['results']
    # Step two in the API process: dig deeply on one domain.
    print("Analyzing historical data for {}".format(domain))
    # Too much scan data... let's filter some out.
    # Use [::density] rather than [0:-1:density] so the subsample does
    # not silently exclude the final (most recent) scan from the slice.
    for scan in site_scan_results[::scan_data_density]:
        timestamps.append(scan['timestamp'])
        scores.append(scan['score'])

# Example URL for one org:
# https://securethe.news/api/v1/sites/slate.com/scans/?limit=100&offset=100

# Graph that business!
print("Generating pyplot scatter chart...")
plt.scatter(timestamps, scores)
plt.title("SecureTheNews")
plt.xlabel("Timestamp")
plt.ylabel("Numeric grade")
plt.show()

# Try a stacked bar chart. Should show upward shift in grade distribution
# over time.
# TODO: We'll need to chunk the x-axis into buckets, though, presumably
# months, in order to stack the results.
print("Generating pyplot stacked bar chart...")
plt.bar(timestamps, scores)
plt.title("SecureTheNews")
plt.xlabel("Timestamp")
plt.ylabel("Numeric grade")
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment