Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Reduces (Amended) HathiTrust Fiction Metadata to the Parameters of the BSPF Data
# -*- coding: utf-8 -*-
# Extract and summarize the gender breakdown for data comparable to
# that reported by Raven et al in *The English Novel 1770-1829: A
# Bibliographical Survey of Prose Fiction Published in the British Isles*
# As a practical matter, this means:
# - works published between 1770 and 1830
# - published in England or Scotland or Ireland
# - individual works (remove duplicates, and count multivol works only once)
import csv, sys
import re
import operator
import string
# Matches any single ASCII punctuation character (everything in
# string.punctuation); used to strip punctuation when normalizing
# author/title strings into work identifiers.
puncutationRegex = re.compile('[%s]' % re.escape(string.punctuation))

# Place-of-publication codes that count as "British Isles" when filtering
# rows (presumably MARC country codes -- confirm against the metadata).
BritishIsles = [
    'enk',  # England
    'stk',  # Scotland
    'ie',   # Ireland
    'wlk',  # Wales
]
def fieldsToWorkID(author, title):
    """Return a simplified identifier for a work built from author and title.

    ASCII punctuation is removed from each field, the two fields are joined
    with a hyphen, every run of whitespace is collapsed to a single
    underscore, and the result is lower-cased. Records whose author and
    title differ only in case, punctuation or spacing therefore map to the
    same identifier.

    For example, ``fieldsToWorkID('Austen, Jane', 'Emma: A Novel')`` gives
    ``'austen_jane-emma_a_novel'``.
    """
    idstring = puncutationRegex.sub('', author) + '-' + puncutationRegex.sub('', title)
    # split() with no argument also drops leading/trailing whitespace.
    idstring = '_'.join(idstring.split())
    return idstring.lower()
def main(filename='fiction_metadata-amended.csv'):
    """Write a per-year gender breakdown of qualifying works to stdout as CSV.

    Reads the metadata CSV at ``filename`` (columns used: ``date``,
    ``author``, ``title``, ``gender``, ``place``) and keeps a row only if:

    - its year of publication is between 1770 and 1830 inclusive,
    - its place code is listed in ``BritishIsles``,
    - its title contains neither "works" nor "novels" (case-insensitive);
      this is an attempt to prevent counting "Collected Works of ..." and
      similar, and so restrict ourselves (like the BSPF) to "new" works.

    Rows that share a work ID (see ``fieldsToWorkID``) are counted once, so
    duplicates and the volumes of a multi-volume work count as one work;
    the first qualifying row supplies the year and gender.

    NOTE(review): the statements that filled in and wrote each output row
    were missing from this copy of the script. The layout used here -- a
    header row ``year,<gender labels...>`` followed by one row of counts
    per year -- is a reconstruction; confirm it against the original.
    """
    # Work IDs already counted. A set keeps the duplicate check cheap.
    vols = set()
    # Dictionary with years as keys, and lists of genders for values
    works = {}

    # 'with' guarantees the input file is closed, even if a row is malformed.
    with open(filename, 'rt') as f:
        for row in csv.DictReader(f):
            try:
                year = int(row['date'])
            except (TypeError, ValueError):
                # A blank or non-numeric date cannot be placed in the
                # 1770-1830 window, so the row is not counted.
                continue
            if not 1770 <= year <= 1830:
                continue
            if row['place'] not in BritishIsles:
                continue

            title = row['title']
            lowered = title.lower()
            if 'works' in lowered or 'novels' in lowered:
                continue

            volID = fieldsToWorkID(row['author'], title)
            if volID in vols:
                continue
            vols.add(volID)
            works.setdefault(year, []).append(row['gender'])

    # Every gender label seen in the kept rows, in a stable column order.
    labels = sorted(set(g for genders in works.values() for g in genders))

    writer = csv.writer(sys.stdout)
    writer.writerow(['year'] + labels)
    for year in sorted(works):
        output = [year]
        output.extend(works[year].count(label) for label in labels)
        writer.writerow(output)
# Run the summary with the default input file when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment