Converts the (Apache-style) EZproxy log file into .csv files for easier analysis, converting the date to ISO format along the way.
import re
import csv
from dateutil import parser
"""Converts the (apache style) ezproxy log file into .csv files for easier analysis.
Current EZproxy config for log output:
(https://help.oclc.org/Library_Management/EZproxy/Configure_resources/LogFormat)
LogFormat %h %{ezproxy-session}i %u %t "%r" %s %b
EZproxy log directives:
%h 132.174.1.1 The IP address of the Host accessing EZproxy
%{ezproxy-session}i EZproxy identifier for the user's current session
%u Username used to log into EZproxy if Option LogUser appears in config.txt; session identifier if Option LogSession appears in config.txt; - otherwise. To log both username and session, add only Option LogUser to config.txt, then use %u for the username and %{ezproxy-session}i for the session identifier.
%t Date/time of request; may also appear as %{format}t to specify a strftime time format.
%r Complete request (e.g. GET http://www.somedb.com HTTP/1.0)
%s HTTP numeric status of request
%b Number of bytes transferred
"""
# Test log file input:
# test = '192.168.1.1 bE7blQ12345 12345678 [20/May/2019:00:00:18 -0400] "GET https://infoweb.newsbank.com:443/apps/news/results?page=17&p=WORLDNEWS&t=country%3AUSA%21USA&sort=YMD_date%3AA&fld-base-0=alltext&maxresults=20&val-base-0=Huma%20Abedin HTTP/1.1" 200 230582'
# test_expected_result = list([('192.168.1.1',
# 'bE7blQ12345',
# '12345678',
# '20/May/2019:00:00:18 -0400',
# 'GET https://infoweb.newsbank.com:443/apps/news/results?page=17&p=WORLDNEWS&t=country%3AUSA%21USA&sort=YMD_date%3AA&fld-base-0=alltext&maxresults=20&val-base-0=Huma%20Abedin HTTP/1.1',
# '200',
# '230582'
# )])
ezproxy_regex = re.compile(
    r"^([^\s]{1,})\s{1}"       # first capture group: %h IP address
    r"([^\s]{1,})\s{1}"        # second capture group: %{ezproxy-session}i user's current session
    r"([^\s]{1,})\s{1}"        # third capture group: %u Username used to log into EZproxy
    r"(?:\[)(.*?)(?:\])\s{1}"  # fourth capture group: %t Date/time of request
    r"(?:\")(.*?)(?:\")\s{1}"  # fifth capture group: %r Complete request
    r"([^\s]{1,})\s{1}"        # sixth capture group: %s HTTP numeric status
    r"([^\s]{1,})"             # seventh capture group: %b Number of bytes transferred
)
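
# A quick, optional sanity check (not part of the original gist): the line below is a
# made-up example in the same format as the test log line above; the regex should
# split it into the seven capture groups.
sample_line = '192.168.1.1 bE7blQ12345 12345678 [20/May/2019:00:00:18 -0400] "GET https://example.com/ HTTP/1.1" 200 1024'
sample_match = ezproxy_regex.search(sample_line)
assert sample_match is not None
assert sample_match.group(1) == '192.168.1.1'                         # %h
assert sample_match.group(4) == '20/May/2019:00:00:18 -0400'          # %t
assert sample_match.group(5) == 'GET https://example.com/ HTTP/1.1'   # %r
assert sample_match.group(7) == '1024'                                # %b
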
# text file contains the ezproxy-formatted log data as defined above
with open('data.txt', 'rt') as data_input:
    # csv file for output (newline='' lets csv.writer control line endings)
    with open('temp.csv', 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL, dialect='excel')
        # write the csv header
        csv_writer.writerow(('ip', 'ezproxy_session', 'patron_username', 'iso_timestamp', 'completed_request', 'http_status', 'num_bytes'))
        for line in data_input:
            x = ezproxy_regex.search(line)
            if x is not None:
                x_groups = list(x.groups())
                # convert the time string to ISO format (replace the first ':' with a ' '
                # so dateutil can parse it), e.g.
                # '20/May/2019:00:00:18 -0400' -> '2019-05-20T00:00:18-04:00'
                x_groups[3] = parser.parse(x_groups[3].replace(':', ' ', 1)).isoformat()
                csv_writer.writerow(x_groups)
"""
After exporting this log data, it was useful to load the .csv directly into a SQLite database table (named "temp" in the examples below).
The following queries against that table are useful for determining usage by time period and by user.
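
One way to do that load, as a minimal sketch, is with Python's built-in sqlite3 and csv modules; the database filename (ezproxy.db) is arbitrary, and the column names match the CSV header written above:

```python
import csv
import sqlite3

con = sqlite3.connect('ezproxy.db')
con.execute(
    "CREATE TABLE IF NOT EXISTS temp ("
    "ip TEXT, ezproxy_session TEXT, patron_username TEXT, "
    "iso_timestamp TEXT, completed_request TEXT, "
    "http_status INTEGER, num_bytes INTEGER)"
)
with open('temp.csv', 'rt') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip the header row written by the script
    con.executemany("INSERT INTO temp VALUES (?, ?, ?, ?, ?, ?, ?)", reader)
con.commit()
con.close()
```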
```sql
-- find sum of data downloaded by date, ezproxy_session, and patron_username
-- order the results by date, and amount downloaded per aggregated group
SELECT
    date(iso_timestamp) AS date,
    ezproxy_session,
    patron_username,
    (SUM(num_bytes) / 1048576) AS sum_mebibyte
FROM
    temp
GROUP BY
    date,
    ezproxy_session,
    patron_username
ORDER BY
    date,
    sum_mebibyte DESC
```
```sql
-- target a specific resource name and a specific date
SELECT
    temp.patron_username,
    (SUM(temp.num_bytes) / 1048576) AS sum_mebibyte
FROM
    temp
WHERE
    temp.completed_request LIKE '%referenceusa%'
    AND date(temp.iso_timestamp) = '2019-05-23'
GROUP BY
    temp.patron_username
ORDER BY
    sum_mebibyte DESC
```
"""