Create a gist now

Instantly share code, notes, and snippets.

Reduces (Amended) HathiTrust Fiction Metadata to the Parameters of the BSPF Data
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Extract and summarize the gender breakdown for data comparable to
# that reported by Raven et al in *The English Novel 1770-1829: A
# Bibliographical Survey of Prose Fiction Published in the British Isles*
# As a practical matter, this means:
# - works published between 1770 and 1830
# - published in England, Scotland, Ireland, or Wales
# - individual works (remove duplicates, and count multivol works only once)
import csv, sys
import re
import operator
import string
# Matches any single character listed in string.punctuation.
puncutationRegex = re.compile('[' + re.escape(string.punctuation) + ']')

# Place-of-publication codes that count as the British Isles:
# England, Scotland, Ireland, Wales (in that order).
BritishIsles = ['enk', 'stk', 'ie', 'wlk']
def fieldsToWorkID(author, title):
    """
    Build a simplified identifier for a work from its author and title.

    Everything matched by puncutationRegex is removed from each field,
    the two fields are joined with a hyphen, every run of whitespace is
    replaced by a single underscore, and the result is lower-cased.
    """
    cleaned = [puncutationRegex.sub('', field) for field in (author, title)]
    tokens = '-'.join(cleaned).split()
    return '_'.join(tokens).lower()
def main(filename='fiction_metadata-amended.csv'):
    """
    Summarize author gender per publication year and print it as CSV.

    Reads the metadata CSV at `filename` (columns used: date, author,
    title, gender, place) and keeps the rows that:
      - have a publication year between 1770 and 1830 inclusive,
      - have a place code listed in BritishIsles, and
      - do not contain "works" or "novels" in the title; this is an
        attempt to leave out "Collected Works of ..." and similar, and
        so restrict the count (like the BSPF) to "new" works.

    Each author/title combination (as reduced by fieldsToWorkID) is
    counted once, under the year of the first row in which it appears.
    Rows whose date cannot be parsed as an integer are skipped.

    Writes to stdout a header row followed by one row per year, in
    ascending order: year, totalWorks, male, female, undetected,
    namemissing.
    """
    # Set rather than list: membership is tested once per qualifying row.
    seen_works = set()
    # Maps publication year (int) to a list of genders, one per counted work.
    genders_by_year = {}

    with open(filename, 'rt') as f:
        for row in csv.DictReader(f):
            try:
                year = int(row['date'])
            except (TypeError, ValueError):
                # A blank or non-numeric date cannot fall inside the
                # range, so skip the row instead of aborting the run.
                continue
            if not 1770 <= year <= 1830:
                continue
            if row['place'] not in BritishIsles:
                continue

            title = row['title']
            lowered_title = title.lower()
            if 'works' in lowered_title or 'novels' in lowered_title:
                continue

            work_id = fieldsToWorkID(row['author'], title)
            if work_id in seen_works:
                # Duplicate record or a further volume of a counted work.
                continue
            seen_works.add(work_id)
            genders_by_year.setdefault(year, []).append(row['gender'])

    writer = csv.writer(sys.stdout)
    writer.writerow(['year', 'totalWorks', 'male', 'female',
                     'undetected', 'namemissing'])
    for year in sorted(genders_by_year):
        genders = genders_by_year[year]
        writer.writerow([
            year,
            len(genders),
            genders.count('male'),
            genders.count('female'),
            genders.count('undetected'),
            genders.count('namemissing'),
        ])
# Run the summary only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment