Created
January 12, 2018 17:57
-
-
Save conorsch/c0c3ee0dfef46444c8c7216054ac7353 to your computer and use it in GitHub Desktop.
First pass at retrieving historical data from Secure The News API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""First pass at retrieving historical data from the Secure The News API.

Fetches the list of tracked news sites, then pulls a subsampled set of
historical scan results for each site and renders two matplotlib charts
of numeric score over time.
"""
import pandas
import requests
import matplotlib.pyplot as plt

# URL for SecureTheNews API (list of all tracked sites).
url = 'https://securethe.news/api/v1/sites/?format=json'

# URL template for retrieving historical scan data for a specific domain.
# Querying historical data for all sites involves a two-step API process.
site_url = 'https://securethe.news/api/v1/sites/{}/scans/'

# Scan data is dense! We have roughly three scans per day per domain.
# That's dense enough, with little enough change, that it clutters graphs.
# Let's instead skip through the historical scans and take a subset,
# e.g. every 500th data point gets graphed.
scan_data_density = 500

# Seconds to wait on each API call, so a stalled connection does not
# hang the script forever.
request_timeout = 30

# Retrieve JSON blob from API. Fail loudly on HTTP errors instead of
# attempting to parse an error page as JSON.
all_sites_request = requests.get(url, timeout=request_timeout)
all_sites_request.raise_for_status()
all_sites = all_sites_request.json()['results']

# Shove JSON into dataframe (not using for now)
# NOTE(review): pandas.io.json.json_normalize is deprecated as of
# pandas 1.0 in favor of pandas.json_normalize — switch once the
# installed pandas version is confirmed to support it.
df = pandas.io.json.json_normalize(all_sites)
# print(df)

# Cheat and use simple data structures for munging data
scores = []
timestamps = []

# Loop through all sites referenced by the API.
# Extract the domain mentioned, then make a second API call
# to get historical data per domain (perhaps API v2 should
# consolidate the two-step process, to make the API more straightforward).
total_num_sites = len(all_sites)
# Count from 1 so the progress display ends at "N/N" rather than
# starting at "0/N" and stopping at "N-1/N".
for i, site in enumerate(all_sites, start=1):
    # Extract target domain from list of domains.
    domain = site['domain']
    print("Evaluating {} ({}/{})".format(domain, i, total_num_sites))
    scans_url = site_url.format(domain)
    site_request = requests.get(scans_url, timeout=request_timeout)
    site_request.raise_for_status()
    site_scan_results = site_request.json()['results']
    # Step two in the API process: dig deeply on one domain.
    print("Analyzing historical data for {}".format(domain))
    # Too much scan data... let's filter some out.
    # Use [::density] rather than [0:-1:density] so the subsample does
    # not silently exclude the final (most recent) scan from the slice.
    for scan in site_scan_results[::scan_data_density]:
        timestamps.append(scan['timestamp'])
        scores.append(scan['score'])

# Example URL for one org:
# https://securethe.news/api/v1/sites/slate.com/scans/?limit=100&offset=100

# Graph that business!
print("Generating pyplot scatter chart...")
plt.scatter(timestamps, scores)
plt.title("SecureTheNews")
plt.xlabel("Timestamp")
plt.ylabel("Numeric grade")
plt.show()

# Try a stacked bar chart. Should show upward shift in grade distribution
# over time.
# TODO: We'll need to chunk the x-axis into buckets, though, presumably
# months, in order to stack the results.
print("Generating pyplot stacked bar chart...")
plt.bar(timestamps, scores)
plt.title("SecureTheNews")
plt.xlabel("Timestamp")
plt.ylabel("Numeric grade")
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment