Create a gist now

Instantly share code, notes, and snippets.

This script was designed to read through a text file of Samuel Richardson's Collection of the Moral and Instructive Sentiments [1755] digitized by Eighteenth Century Collections Online and the Text Creation Partnership.
import re, sys
from collections import defaultdict
# Lowercase Roman numerals for volume numbers one through eight.
romanNum = {'i': 1, 'ii': 2, 'iii': 3, 'iv': 4, 'v': 5, 'vi': 6, 'vii': 7, 'viii': 8}


def Roman(num):
    """Convert a Roman-numeral volume number to an integer.

    Hack for dealing with volume numbers less than nine: the value is looked
    up in ``romanNum`` rather than parsed.

    Args:
        num: Roman numeral as a string; upper or lower case is accepted.

    Returns:
        The integer 1-8, or None when ``num`` is not a key of ``romanNum``.
    """
    return romanNum.get(num.lower())
# Both input paths are required; exit with a usage hint instead of an IndexError.
if len(sys.argv) < 3:
    sys.exit('usage: ' + sys.argv[0] + ' CMIS_FILE TABLE_OF_CONTENTS_FILE')

# Table of Contents file: one topic per line. Lines whose raw length
# (including the trailing newline) is 3 or less are skipped.
with open(sys.argv[2], 'r') as f2:
    topics = [line.strip() for line in f2 if len(line) > 3]  # Create array of topics

# CMIS file. Deliberately left open: the main loop iterates over this handle.
f1 = open(sys.argv[1], 'r')

# Convert pagination. Key: volume number; Value: offset added to a page
# number cited within that volume.
startPage = {1: 0, 2: 348, 3: 703, 4: 1055, 5: 1440, 6: 1798, 7: 2229, 8: 2671}

# Key: character name; Value: {topic: number of matching citation lines}
characters = defaultdict(lambda: defaultdict(int))
# Key: page number (after offset); Value: number of citations
frequency = defaultdict(int)
# Key: topic name; Value: [topic ordinal, [page numbers of associated citations]]
location = {}
# Stores most popular passages.
# Key: (page number, citation count); Value: {topic: citations of that page}
counts = defaultdict(lambda: defaultdict(int))

# 'counter' increments when a new topic is reached; 'total' stores cumulative
# citations; 'topic' holds the heading currently being read.
counter, total, topic = 0, 0, ''
# Main code: loop over the datafile. Topic headings switch the current topic;
# lines carrying a citation (a Roman-numeral volume reference followed later
# by '[', or 'ibid') are tallied per character, per topic and per page.
# NOTE(review): assumes the first topic heading precedes the first citation;
# otherwise location[topic] raises KeyError -- confirm against the data.

# Patterns are compiled once instead of being re-parsed for every line.
_CITATION = re.compile(r'[iv]+\..*(?=\[)')  # volume reference up to a later '['
_NON_WORD = re.compile(r'\W')
_DIGIT = re.compile(r'\d')
# Checked in this order; only the first name found on a line is credited.
_SPEAKERS = (
    ('belf', 'Belford'),
    ('lovel', 'Lovelace'),
    ('clarissa', 'Clarissa'),
    ('miss howe', 'Miss Howe'),
)
_topic_set = set(topics)  # O(1) heading test instead of scanning the list per line
_last_page = None  # most recently cited page; what 'ibid' refers back to

with f1:  # close the datafile once it has been consumed
    for line in f1:
        heading = line.strip()
        if heading in _topic_set:  # Are we at a new topic?
            topic = heading
            counter += 1
            location[topic] = [counter, []]  # [topic ordinal, cited pages]
            continue

        citation = _CITATION.search(line)
        if citation is None and 'ibid' not in line:
            continue  # not a citation line

        lowered = line.lower()
        for needle, name in _SPEAKERS:
            if needle in lowered:
                characters[name][topic] += 1
                break

        if citation is not None:
            for token in _NON_WORD.split(citation.group()):
                if token in romanNum:
                    # The volume offset stays in force for the page numbers
                    # that follow until another numeral replaces it.
                    volume = startPage[Roman(token)]
                elif _DIGIT.match(token):
                    page = volume + int(token)
                    location[topic][1].append(page)
                    frequency[page] += 1
                    _last_page = page
        elif _last_page is not None:
            # 'ibid' repeats the previous citation. Using the last cited page
            # overall also covers an 'ibid' that opens a topic, where indexing
            # the topic's empty page list with [-1] raised IndexError.
            location[topic][1].append(_last_page)
            frequency[_last_page] += 1
# Reporting. Every print passes one parenthesised argument, so the output is
# the same under the Python 2 print statement and the Python 3 print function.

# Pages ranked from most to least cited.
for page_no in sorted(frequency, key=frequency.get, reverse=True):
    print('Page: ' + str(page_no) + ', Citations: ' + str(frequency[page_no]))

# Prepend each topic's citation count so the entries sort by it; an entry
# becomes [citation count, topic ordinal, [pages]].
for entry in location.values():
    entry.insert(0, len(entry[1]))

# Topics ranked by citation count, printed as an (ordinal, topic, count) tuple.
for name in sorted(location, key=location.get, reverse=True):
    print((location[name][1], name, location[name][0]))

# Most popular passages: pages cited more than six times, broken down by the
# topics that cite them.
popular = {page_no: hits for page_no, hits in frequency.items() if hits > 6}
for page_no, hits in popular.items():
    counts[page_no, hits]  # indexing the defaultdict creates the (possibly empty) entry
for name, entry in location.items():
    for page_no in entry[2]:
        if page_no in popular:
            counts[page_no, popular[page_no]][name] += 1

total += sum(frequency.values())
print('Total number of topics: ' + str(len(topics)))
print('Total number of citations: ' + str(total))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment