c-forster/extractComparisonSetForBSPF.py

## extractComparisonSetForBSPF.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Extract and summarize the gender breakdown for data comparable to
# that reported by Raven et al in *The English Novel 1770-1829: A
# Bibliographical Survey of Prose Fiction Published in the British Isles*
# As a practical matter, this means:
# - works published between 1770 and 1830
# - published in England, Scotland, Ireland, or Wales
# - individual works (remove duplicates, and count multivol works only once)

import csv, sys
import re
import operator
import string

# Character class matching any single ASCII punctuation mark; used to
# strip punctuation from author and title strings.
puncutationRegex = re.compile('[' + re.escape(string.punctuation) + ']')

# Place-of-publication codes accepted as "British Isles".
# NOTE(review): these look like MARC country codes -- confirm against
# the metadata source.
BritishIsles = ['enk', 'stk', 'ie', 'wlk']  # England, Scotland, Ireland, Wales


def fieldsToWorkID(author, title):
    """
    Build a simplified identifier string for a work from its author and
    title.

    Punctuation is stripped from each field, the two are joined with a
    hyphen, every run of whitespace is replaced by a single underscore,
    and the result is lower-cased.
    """
    cleanedAuthor, cleanedTitle = (
        puncutationRegex.sub('', field) for field in (author, title)
    )

    # split() with no argument also discards leading/trailing whitespace.
    tokens = '-'.join([cleanedAuthor, cleanedTitle]).split()

    return '_'.join(tokens).lower()

def main(filename='fiction_metadata-amended.csv'):
    """
    Print a per-year gender breakdown of qualifying works as CSV on stdout.

    Reads the metadata CSV at `filename`, which must provide the columns
    'date', 'author', 'title', 'gender' and 'place'. A row is counted when:

    - its date is a year from 1770 to 1830 inclusive;
    - its place code is listed in BritishIsles;
    - its title contains neither "works" nor "novels" (case-insensitive);
      this is an attempt to leave out "Collected Works of ..." and
      similar, and so restrict the count (like the BSPF) to "new" works;
    - no earlier row produced the same author/title work ID, so that
      duplicates and later volumes of a multivolume work count only once.

    Rows whose date cannot be parsed as an integer are skipped instead of
    aborting the whole run.

    Output columns: year, totalWorks, male, female, undetected,
    namemissing. totalWorks counts every retained work for the year,
    whatever its gender value.
    """
    # A set (not a list) keeps the duplicate check O(1) per row.
    seenWorkIDs = set()

    # Dictionary with years (as ints) as keys, and lists of genders as values
    works = {}

    with open(filename, 'rt') as f:
        for row in csv.DictReader(f):
            try:
                year = int(row['date'])
            except (TypeError, ValueError):
                # Missing, blank or non-numeric date: cannot be in range.
                continue

            if not 1770 <= year <= 1830:
                continue
            if row['place'] not in BritishIsles:
                continue

            title = row['title']
            loweredTitle = title.lower()
            if 'works' in loweredTitle or 'novels' in loweredTitle:
                continue

            workID = fieldsToWorkID(row['author'], title)
            if workID in seenWorkIDs:
                continue

            seenWorkIDs.add(workID)
            works.setdefault(year, []).append(row['gender'])

    writer = csv.writer(sys.stdout)
    writer.writerow(['year','totalWorks','male','female','undetected','namemissing'])
    for year in sorted(works):
        genders = works[year]
        writer.writerow([
            year,
            len(genders),
            genders.count('male'),
            genders.count('female'),
            genders.count('undetected'),
            genders.count('namemissing'),
        ])

if __name__ == "__main__":
    # Optional first command-line argument: path to the metadata CSV.
    # With no argument, main() falls back to its default filename.
    main(*sys.argv[1:2])
	#!/usr/bin/python
	# -*- coding: utf-8 -*-
	# Extract and summarize the gender breakdown for data comparable to
	# that reported by Raven et al in *The English Novel 1770-1829: A
	# Bibliographical Survey of Prose Fiction Published in the British Isles*
	# As a practical matter, this means:
	# - works published between 1770 and 1830
	# - published in England, Scotland, Ireland, or Wales
	# - individual works (remove duplicates, and count multivol works only once)

	import csv, sys
	import re
	import operator
	import string

	# Character class matching any single ASCII punctuation mark; used to
	# strip punctuation from author and title strings.
	puncutationRegex = re.compile('[' + re.escape(string.punctuation) + ']')

	# Place-of-publication codes accepted as "British Isles".
	# NOTE(review): these look like MARC country codes -- confirm against
	# the metadata source.
	BritishIsles = [
	    'enk',  # England
	    'stk',  # Scotland
	    'ie',   # Ireland
	    'wlk',  # Wales
	]


	def fieldsToWorkID(author, title):
	    """
	    Build a simplified identifier string for a work from its author
	    and title.

	    Punctuation is stripped from each field, the two are joined with a
	    hyphen, every run of whitespace is replaced by a single underscore,
	    and the result is lower-cased.
	    """
	    idstring = puncutationRegex.sub('', author) + '-' + puncutationRegex.sub('', title)

	    # split() with no argument also discards leading/trailing whitespace.
	    idstring = '_'.join(idstring.split())

	    return idstring.lower()

	def main(filename='fiction_metadata-amended.csv'):
	    """
	    Print a per-year gender breakdown of qualifying works as CSV on
	    stdout.

	    Reads the metadata CSV at `filename`, which must provide the
	    columns 'date', 'author', 'title', 'gender' and 'place'. A row is
	    counted when:

	    - its date is a year from 1770 to 1830 inclusive;
	    - its place code is listed in BritishIsles;
	    - its title contains neither "works" nor "novels"
	      (case-insensitive); this is an attempt to leave out "Collected
	      Works of ..." and similar, and so restrict the count (like the
	      BSPF) to "new" works;
	    - no earlier row produced the same author/title work ID, so that
	      duplicates and later volumes of a multivolume work count only
	      once.

	    Rows whose date cannot be parsed as an integer are skipped instead
	    of aborting the whole run.

	    Output columns: year, totalWorks, male, female, undetected,
	    namemissing. totalWorks counts every retained work for the year,
	    whatever its gender value.
	    """
	    # A set (not a list) keeps the duplicate check O(1) per row.
	    seenWorkIDs = set()

	    # Dictionary with years (as ints) as keys, and lists of genders as values
	    works = {}

	    with open(filename, 'rt') as f:
	        for row in csv.DictReader(f):
	            try:
	                year = int(row['date'])
	            except (TypeError, ValueError):
	                # Missing, blank or non-numeric date: cannot be in range.
	                continue

	            if not 1770 <= year <= 1830:
	                continue
	            if row['place'] not in BritishIsles:
	                continue

	            title = row['title']
	            loweredTitle = title.lower()
	            if 'works' in loweredTitle or 'novels' in loweredTitle:
	                continue

	            workID = fieldsToWorkID(row['author'], title)
	            if workID in seenWorkIDs:
	                continue

	            seenWorkIDs.add(workID)
	            works.setdefault(year, []).append(row['gender'])

	    writer = csv.writer(sys.stdout)
	    writer.writerow(['year','totalWorks','male','female','undetected','namemissing'])
	    for year in sorted(works):
	        genders = works[year]
	        writer.writerow([
	            year,
	            len(genders),
	            genders.count('male'),
	            genders.count('female'),
	            genders.count('undetected'),
	            genders.count('namemissing'),
	        ])

	if __name__ == "__main__":
	    # Optional first command-line argument: path to the metadata CSV.
	    # With no argument, main() falls back to its default filename.
	    main(*sys.argv[1:2])