Skip to content

Instantly share code, notes, and snippets.

@jennyd
Created January 6, 2017 18:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jennyd/806cc592350837b72459283d81ddadac to your computer and use it in GitHub Desktop.
Analyse govuk-delivery subscription URLs and their query params
#!/usr/bin/env python
import csv
import urlparse
from collections import Counter
'''Analyse data from govuk-delivery's database to find out which URLs exist as
subscription topics and which query params they use. This will help us work out
whether all Whitehall content is already tagged to enough things in the links
hash for us to model these subscriptions in email-alert-api and still be able
to match all relevant content to them.
We haven't yet cleaned up topics with no subscribers or ones which have never
sent an email - that may shrink the list of query params.
'''
# Export govuk-delivery's database to CSV in dev:
# mongoexport --host localhost --db govuk_delivery --collection topics --csv --out govuk-delivery-topics.csv --fields _id,created,topic_id
# Read the mongoexport CSV of subscription topics; each row's _id column is
# the topic's subscription URL.
with open('govuk-delivery-topics.csv') as csv_file:
    topics = list(csv.DictReader(csv_file))

urls = [topic['_id'] for topic in topics]
# Known URL path segments under https://www.gov.uk/government/ that carry
# subscription topics; URLs matching none of these are reported as "other".
path_segments = [
    'announcements',
    'feed',
    'ministers',
    'organisations',
    'people',
    'policies',
    'publications',
    'statistics',
    'topical-events',
    'topics',  # policy areas
    'world',
]
def extract_query_params(url_list):
    '''Return, for each URL, the list of (key, value) query-string pairs.

    Uses urlparse (imported at module level) to split each URL and parse
    its query string.
    '''
    query_strings = (urlparse.urlparse(url).query for url in url_list)
    return [urlparse.parse_qsl(query) for query in query_strings]
def query_param_keys(query_params_list):
    '''Flatten per-URL lists of (key, value) pairs into a flat list of keys.

    Duplicate keys are kept, so the result can be fed to a Counter.
    '''
    keys = []
    for params in query_params_list:
        for key, _value in params:
            keys.append(key)
    return keys
def comment(key):
    '''Return an explanatory comment for some keys.

    They have out-of-date names or their meanings are otherwise unclear.
    Unknown keys get an empty string.
    '''
    explanations = {
        'topics[]': ' # policy areas',
        'departments[]': ' # organisations',
        'announcement_filter_option': ' # some kind of format',
        'publication_filter_option': ' # some kind of format',
        'relevant_to_local_government': ' # boolean (1)',
        'official_document_status': ' # act_papers_only/command_papers_only/command_and_act_papers',
    }
    return explanations.get(key, '')
base_url = 'https://www.gov.uk/government/'

# Per-segment report: how many subscription URLs live under each known path
# segment, and which query param keys they use (with usage counts).
for segment in path_segments:
    prefix = base_url + segment
    relevant_urls = [url for url in urls if url.startswith(prefix)]
    print('{} URLs starting with {}'.format(len(relevant_urls), prefix))
    print('Query param keys and usage counts:')
    key_counts = Counter(query_param_keys(extract_query_params(relevant_urls)))
    for key, count in key_counts.items():
        print(' {}: {}{}'.format(key, count, comment(key)))
    print('')

# Anything that matched none of the known segments.
other_urls = [url for url in urls
              if not any(url.startswith(base_url + segment)
                         for segment in path_segments)]
print('Other URLs:')
for url in other_urls:
    print(url)
print('\n')

# Collect every value used with the *_filter_option params across all URLs.
print('Values for *_filter_options:')
filter_option_values = []
all_query_params = extract_query_params(urls)
for url_params in all_query_params:
    for key, value in url_params:
        if key in ('announcement_filter_option', 'publication_filter_option'):
            filter_option_values.append(value)
print('{} total, {} unique values:'.format(len(filter_option_values),
                                           len(set(filter_option_values))))
for v in set(filter_option_values):
    print(' {}'.format(v))
print('')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment