Skip to content

Instantly share code, notes, and snippets.

@paralax
Last active January 26, 2020 22:39
Show Gist options
  • Save paralax/93d781621f54dcc87a6e to your computer and use it in GitHub Desktop.
Save paralax/93d781621f54dcc87a6e to your computer and use it in GitHub Desktop.
quick and dirty skim over the list of CA breach notices
#!/usr/bin/env python
from collections import Counter
import itertools
import urllib
from BeautifulSoup import BeautifulSoup
import timestring
# Quick skim over the California Attorney General's list of data-breach
# notices: fetch the listing page, then print (1) how many breach dates fall
# in each year and (2) how often each organization name appears.
URL = "http://oag.ca.gov/ecrime/databreach/list"

# Fetch the page. The Python 2 urllib response object is not a context
# manager, so close it explicitly even if read() raises.
response = urllib.urlopen(URL)
try:
    html = response.read()
finally:
    response.close()

soup = BeautifulSoup(html)

# Keep only the <tr> elements that carry at least one attribute; rows with an
# empty attribute list are skipped (presumably the table header -- verify
# against the page markup).
trs = [tr for tr in soup.findAll('tr') if tr.attrs != []]

# Column layout, counted from the end of each row:
#   -1 is date reported
#   -2 is date breached, if known (or n/a)
# A single cell may hold several comma-separated dates. Strip each fragment
# so that padded values such as ' n/a' and empty fragments are discarded
# instead of being handed to the date parser.
dates = []
for tr in trs:
    for fragment in tr.findAll('td')[-2].text.split(','):
        fragment = fragment.strip()
        if fragment and fragment != 'n/a':
            dates.append(fragment)

# counts by year
print(Counter(timestring.Date(d).year for d in dates))

# most common reporting orgs (first cell of each row)
print(Counter(tr.findAll('td')[0].text for tr in trs).most_common())
"""
breaches by year: {2014: 180, 2013: 155, 2012: 146, 2015: 32, 2011: 26, 2010: 3, 2007: 1, 2009: 1}
most common orgs breached:
(u'American Express Travel Related Services Company, Inc and /or its Affiliates (\u201cAXP\u201d)',
61),
(u'Discover Financial Services', 12),
(u'Massachusetts Mutual Life Insurance Company', 5),
(u'Yolo Federal Credit Union', 3),
(u'California Department of Public Health', 3),
(u'East West Bank', 3),
(u'California Department of Corrections and Rehabilitation', 3)
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment