Created May 16, 2013 12:25
The program I used to analyze the balance of gender coverage in Wikipedia and Britannica. The gender-guessing heuristics might be useful to others.
#!/usr/bin/python2.6
# -*- coding: utf-8 -*-
# (c) Copyright 2011-2013 by Joseph Reagle
# Licensed under the GPLv3, see <http://www.gnu.org/licenses/gpl-3.0.html>

import codecs
from difflib import SequenceMatcher
from cfiledict import FileDict  # compressed FileDict
#from filedict import FileDict  # http://erezsh.wordpress.com/2009/05/24/filedict-a-persistent-dictionary-in-python/
import logging
import lxml
from lxml import etree  # used below as the bare name `etree`
from lxml.html.clean import clean_html
from lxml.html import builder as E  # http://effbot.org/zone/element-builder.htm
from lxml.html import fragment_fromstring  # http://codespeak.net/lxml/lxmlhtml.html
from StringIO import StringIO
from optparse import OptionParser
import random
import re
from rpy import r
import simplejson
import sys
import time
import unicodedata
import urllib
import urllib2
from web_little import get_HTML  # http://bitbucket.org/reagle/thunderdell/src/tip/web.py
from os import environ

HOME = environ['HOME']

#import socket
#socket.setdefaulttimeout(20)

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
###############
# Exceptions
###############

class FailedGet(Exception):
    '''Exception for when one can't return something.'''
    pass

###############
# Maths
###############

def L_ratio(seq1, seq2):
    '''Sugar function for difflib.SequenceMatcher.ratio()'''
    dbg(" comparing '%s' w/ '%s'" % (seq1, seq2))
    return round(SequenceMatcher(None, seq1, seq2).ratio(), 2)
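
# For example (a doctest-style sketch added for illustration; assumes dbg is
# bound to a logger, as done in the __main__ block below):
#
#   >>> L_ratio('smith', 'smyth')
#   0.8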
###############
# String functions
###############

from htmlentitydefs import name2codepoint
name2codepoint['#39'] = 39  # python 2.5.2 doesn't have apostrophe

def unescape(s):
    """Unescape HTML code refs; c.f. http://wiki.python.org/moin/EscapingHtml."""
    return re.sub('&(%s);' % '|'.join(name2codepoint),
        lambda m: unichr(name2codepoint[m.group(1)]), s)
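
# For example (illustrative only):
#
#   >>> unescape(u'Caf&eacute; &amp; Bar')
#   u'Caf\xe9 & Bar'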
def strip_accents(text):
    """Test if ascii; if not, remove accents."""
    # Doctest disabled because of a unicode doctest bug; note the expected
    # output also reflects the '-' to ' ' replacement below:
    #>>> strip_accents(u'nôn-åscîî')
    #u'non ascii'
    def not_combining(char):
        return unicodedata.category(char) != 'Mn'
    try:  # test if ascii
        text.encode('ascii')
    except UnicodeEncodeError:
        normalized_text = unicodedata.normalize('NFD', text)
        result_text = filter(not_combining, normalized_text)
    else:
        result_text = text
    result_text = result_text.replace('-', ' ').replace('"', '').strip()
    return result_text
def are_similar(bio, result):
    """Compare list title with returned titles for relatedness."""
    title = result['titleNoFormatting'].split(' -')[0].strip()  # remove encyclopedia
    for name in bio.name.split(' OR '):
        if '(' not in name:  # if no paren in name, remove paren from title
            title = title.split('(')[0]
        if ', ' in title:  # Britannica sometimes uses Last, First
            if len(title.split(',')) == 2:
                last, first = title.split(', ')
                title = first + ' ' + last
        dbg(" comparing %s" % name)
        name_chars = unescape(strip_accents(name)).lower()
        title_chars = unescape(strip_accents(title)).lower()
        name_wrds = name_chars.split()
        title_wrds = title_chars.split()
        name_set = set(name_wrds)
        title_set = set(title_wrds)
        if (name_set.issubset(title_set) or title_set.issubset(name_set)) and \
                (len(title_set) > 1 and 'surname' not in title_set):
            dbg(" PASSED sup/sub %s" % title_set)
            return True
        if len(name_wrds) == 1 or len(title_wrds) == 1:
            words_threshold = 0.68  # 0.67
            chars_threshold = 0.77  # 0.68
        else:
            if len(name_wrds) == 2:
                words_threshold = 0.62
                chars_threshold = 0.71  # 0.69
            if len(name_wrds) == 3:
                words_threshold = 0.62
                chars_threshold = 0.70  # 0.69
            if len(name_wrds) >= 4:
                words_threshold = 0.45  # 0.40
                chars_threshold = 0.60
            if len(name_wrds) >= 5:  # long names often have title words out of order
                name_wrds = sorted(name_wrds)
                title_wrds = sorted(title_wrds)
                name_chars = ' '.join(name_wrds)
                title_chars = ' '.join(title_wrds)
                words_threshold = 0.81
                chars_threshold = 0.80
        words_ratio = L_ratio(name_wrds, title_wrds)
        if words_ratio >= words_threshold:
            dbg(u" PASSED words %2.2f >= %2.2f '%s'" % (
                words_ratio, words_threshold, title_wrds))
            return True
        else:
            dbg(u" FAILED words %2.2f < %2.2f '%s'" % (
                words_ratio, words_threshold, title_wrds))
        chars_ratio = L_ratio(name_chars, title_chars)
        if chars_ratio >= chars_threshold:
            dbg(u" PASSED chars %2.2f >= %2.2f '%s'" % (
                chars_ratio, chars_threshold, title_chars))
            return True
        else:
            dbg(u" FAILED chars %2.2f < %2.2f '%s'" % (
                chars_ratio, chars_threshold, title_chars))
    return False
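
# A worked sketch (hypothetical result dict; assumes dbg is bound as in
# __main__). The name's word set is a subset of the title's, the title set
# has more than one word and no 'surname', so the subset test passes:
#
#   >>> result = {'titleNoFormatting': 'Mary Wollstonecraft Shelley'}
#   >>> are_similar(Biography('Mary Shelley'), result)
#   True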
def split_name_date(person):
    """Source lists may carry optional birth/death dates; split them off."""
    dbg("person = '%s'" % person)
    name = born = died = ''
    if '<' in person:
        name, date = person.rsplit(' <', 1)
        born, died = date[:-1].split(' - ')
    else:
        name = person
    if opts.no_dates:
        born = died = ''
    return name, born, died
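
# For example (a hypothetical list entry; assumes opts.no_dates is False and
# dbg is bound as in __main__):
#
#   >>> split_name_date(u'Ada Lovelace <1815 - 1852>')
#   (u'Ada Lovelace', u'1815', u'1852')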
###############
# Create report
###############

def create_html_report(source_fn_base, bios, people, people_url):
    """Return HTML report of topical encyclopedic coverage."""
    critical("performing analysis for report")
    report_f = codecs.open(source_fn_base + '.html', 'w', 'UTF-8', 'replace')
    if opts.export_csv:
        csv_f = codecs.open(source_fn_base + '.csv', 'w', 'UTF-8', 'replace')
    DOCTYPE = u'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'
    html = E.HTML(
        E.HEAD(
            E.TITLE("%s Biographies" % source_fn_base),
            E.LINK(rel="stylesheet", type="text/css",
                href="../../../2001/reagle.css"),
            E.STYLE(
                ".male {background-color: #f2f2ff}"
                ".female {background-color: #fff2fd}"
                ".unknown {background-color: #f4fff2}", type="text/css")
        ))
    # for R dataframe
    genders = []
    wp_sizes = []
    eb_sizes = []
    for person in people:
        key, dummy, dummy = split_name_date(person)
        bio = bios[key]
        genders.append(bio.gender)
        wp_sizes.append(bio.wp_wc)
        eb_sizes.append(bio.eb_wc)
    # Statistical analysis; rpy maps the underscores in wp_size/eb_size to
    # dots, hence `refwork$wp.size` in the R expressions below
    r.assign('refwork', r.data_frame(gender=genders, wp_size=wp_sizes, eb_size=eb_sizes))
    SRCmale = r('sum(refwork$gender == "male")')
    SRCfem = r('sum(refwork$gender == "female")')
    SRCun = r('sum(refwork$gender == "unknown")')
    WPmale = r('sum(refwork$gender == "male" & refwork$wp.size != 0)')
    WPfem = r('sum(refwork$gender == "female" & refwork$wp.size != 0)')
    EBmale = r('sum(refwork$gender == "male" & refwork$eb.size != 0)')
    EBfem = r('sum(refwork$gender == "female" & refwork$eb.size != 0)')
    SRCgendered = r('sum(refwork$gender != "unknown")')
    WPgendered = r('sum(refwork$gender != "unknown" & refwork$wp.size != 0)')
    EBgendered = r('sum(refwork$gender != "unknown" & refwork$eb.size != 0)')
    inSRC = r('length(refwork$gender)')
    missingEB = r('sum(refwork$eb.size == 0)')
    missingWP = r('sum(refwork$wp.size == 0)')
    missingBoth = r('sum(refwork$eb.size == 0 & refwork$wp.size == 0)')
    wp_existing_summary = ' '.join((' %s %5.0f ' % (k, v)
        for k, v in r('summary(refwork$wp.size[refwork$wp.size != 0], digits=5)').items()))
    eb_existing_summary = ' '.join((' %s %5.0f ' % (k, v)
        for k, v in r('summary(refwork$eb.size[refwork$eb.size != 0], digits=5)').items()))
    wp_median = r('median(refwork$wp.size[refwork$wp.size != 0])')
    eb_median = r('median(refwork$eb.size[refwork$eb.size != 0])')
    mutual_wp_median = r('median(refwork$wp.size[refwork$wp.size != 0 & '
        'refwork$eb.size != 0])')
    mutual_eb_median = r('median(refwork$eb.size[refwork$eb.size != 0 & '
        'refwork$wp.size != 0])')
    body = E.BODY(
        E.H1("%s Biographies" % source_fn_base)
    )
    html.append(body)
    body.extend((
        E.P(
            "Missing: WP = %d ; EB = %d ; neither = %d . "
            % (missingWP, missingEB, missingBoth)),
        )
    )
    if opts.gender:
        gender_text = (
            "<p>Of %d entries: I guess that %d are <span class='female'>female</span>, "
            "%d are <span class='male'>male</span>, "
            "and %d are <span class='unknown'>unknown</span>. "
            "That is, females are %0.2f of the gender-known population. "
            "Of the Wikipedia articles, females are %0.2f (%d/(%d+%d)); "
            "and %0.2f (%d/(%d+%d)) at Britannica. </p>"
            % (
                inSRC, SRCfem,
                SRCmale,
                SRCun,
                float(SRCfem) / (SRCgendered),
                float(WPfem) / (WPgendered), WPfem, WPmale, WPfem,
                float(EBfem) / (EBgendered), EBfem, EBmale, EBfem
            ))
        body.append(fragment_fromstring(gender_text))
    body.extend((
        E.P(
            "Existing median word count (counting only articles that exist in the respective work): WP = %0.0f ; EB = %0.0f . "
            "WP median article size is roughly %0.1f times larger."
            % (wp_median, eb_median, wp_median / eb_median)),
        E.P("Five-figure summaries for existing articles."),
        E.PRE(
            "WP: %s \n"
            "EB: %s "
            % (wp_existing_summary, eb_existing_summary)),
        E.P(
            "Mutual median word count (for articles that exist in both works): WP median = %0.0f words, EB = %0.0f . "
            "Mutual WP median article size is roughly %0.1f times larger."
            % (mutual_wp_median, mutual_eb_median, mutual_wp_median / mutual_eb_median)),
    ))
    r('png(file="%s.png")' % source_fn_base)  # ,width=733,height=550
    r('boxplot(refwork$wp.size[refwork$wp.size != 0], refwork$eb.size[refwork$eb.size != 0], '
        'names = c("Wikipedia", "Britannica"), '
        'main = "Existing article word counts")')
    body.append(
        E.P(
            E.IMG(alt="size distribution", src="%s.png" % source_fn_base)
        )
    )
    critical("generating table")
    table = E.TABLE(
        E.COL(), E.COL(E.CLASS('col-alt')), E.COL(),
        E.COL(E.CLASS('col-alt')), E.COL(),
        E.THEAD(E.TR(E.TH('Name'),
            E.TH('Wikipedia', width="35%"), E.TH('Words'),
            E.TH('Britannica', width="35%"), E.TH('Words'))),
        width="100%", cellpadding="5", border="1")
    #if opts.export_csv:
        #csv_f.write('source, name, gender, born, died, list, count \n')
    for person in people:
        key, dummy, dummy = split_name_date(person)
        bio = bios[key]
        table.append(
            E.TR(
                E.TD(E.A(bio.name, href='%s' % (people_url)),  # , bio.last_name
                    E.CLASS(bio.gender)),
                E.TD(E.A(bio.wp_title, href=bio.wp_purl), E.CLASS(bio.gender)),
                E.TD(str(bio.wp_wc)),  # size
                E.TD(E.A(bio.eb_title, href=bio.eb_url), E.CLASS(bio.gender)),
                E.TD(str(bio.eb_wc)),  # size
                valign="center",
            ))
        if opts.text_include:
            table.append(
                E.TR(
                    E.TD(''),
                    E.TD(colspan='2', *[E.P(p) for p in bio.wp_text.split('\n')]),
                    E.TD(colspan='2', *[E.P(p) for p in bio.eb_text.split('\n')]),
                    valign="top",
                ),
            )
        if opts.export_csv:
            for source in source_fn_base.split('-'):
                if source.isalpha():
                    break
            csv_f.write(u'%s;%s;%s;%s;%s;WP;%s\n' %
                (source, bio.name, bio.gender, bio.born, bio.died, bio.wp_wc))
            csv_f.write(u'%s;%s;%s;%s;%s;EB;%s\n' %
                (source, bio.name, bio.gender, bio.born, bio.died, bio.eb_wc))
    body.append(table)
    report_f.write(DOCTYPE + lxml.html.tostring(html, pretty_print=True))
    report_f.close()
    if opts.export_csv:
        csv_f.close()
###############
# Web
###############

def url_OK(url):
    '''Check if URL is in right Web space and with textual extension.'''
    dbg(" testing %s" % url)
    is_ok = True
    BAD_EXTENSIONS = ('doc', 'pdf', 'jpg', 'png', 'gif')  # Google
    BAD_WP_NAMESPACES = ('/Wikiquote:', '/Wikisource:')
    if url.split('.')[-1].lower() in BAD_EXTENSIONS:
        is_ok = False
    if 'britannica.com' in url:
        if '/EBchecked/topic/' not in url:
            is_ok = False
    if 'wikipedia.org' in url:
        if '(disambiguation)' in url:  # WP
            is_ok = False
        if any([ns in url for ns in BAD_WP_NAMESPACES]):
            is_ok = False
    dbg(" url_OK = %s" % is_ok)
    return is_ok
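
# For example (illustrative URLs; assumes dbg is bound as in __main__):
#
#   >>> url_OK('http://en.wikipedia.org/wiki/Ada_Lovelace')
#   True
#   >>> url_OK('http://www.britannica.com/bps/search?query=lovelace')
#   False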
def query_google(query, google_queries, do_refresh, retry_counter=0):
    """Return ordered results from Google API via google_queries cache."""
    info(" testing google_queries for '%s'" % query)
    results = google_queries.get(query, None)
    if do_refresh or (results is None and not opts.cache_only):
        info(" refresh or g_cache MISS for %s" % query)
        url = "http://ajax.googleapis.com/ajax/services/" + \
            "search/web?start=0&v=1.0&%s" % (  # &rsz=large for 8 results
            urllib.urlencode({'q': query.encode('utf-8')}))
        request = urllib2.Request(url, None, {'Referer': 'http://reagle.org/joseph/'})
        try:
            search_results = urllib2.urlopen(request)
            json = simplejson.loads(search_results.read())
            dbg(" responseStatus = %s" % json['responseStatus'])
            if json['responseStatus'] == 404:
                raise FailedGet("Google API: %s" % json['responseDetails'])
            results = json['responseData']['results']
            info(" got results")
        except (urllib2.URLError) as e:  # TypeError,
            if retry_counter >= 2:
                raise FailedGet("Failed to get after 3 attempts.")
            info(" retrying query_google after %s" % e)
            time.sleep(retry_counter * 10 + 5)  # pause 5s, 15s, 25s before retrying
            results = query_google(query, google_queries, do_refresh, retry_counter + 1)
        google_queries[query] = results
    if results is None and opts.cache_only:
        raise FailedGet("No cache result found with cache-only option.")
    info(" returning results")
    return results
def query_web(url, web_queries, do_refresh):
    """Grab Web page (encyclopedic article) from URL via web_queries cache."""
    url = url.rsplit('#')[0]  # don't bother with fragments
    info(" testing web_queries for '%s'" % url)
    dbg(" do_refresh = %s, opts.refresh = %s opts.cache_only = %s" %
        (do_refresh, opts.refresh, opts.cache_only))
    html = web_queries.get(url, None)
    if do_refresh or (html is None and not opts.cache_only):
        dbg(" retrieving from Web")
        time.sleep(random.randint(1, 3))
        html, response = get_HTML(url)
        web_queries[url] = html
    else:
        dbg(" found in web_queries cache")
    if html is None and opts.cache_only:
        raise FailedGet("No cache result found with cache-only option.")
    return html
def get_text(content_node):
    '''Return textual content of nodes sans elements.'''
    content_text = ''.join(content_node.xpath("descendant-or-self::text()"))
    return content_text

def remove_node(node):
    '''Remove a node from a tree.'''
    parent = node.getparent()
    parent.remove(node)
def get_wp_text_wc(html):
    '''Return content of article in textual form without miscellany.'''

    def trim_wp(node):
        '''To keep analysis comparable, remove most everything but article prose.'''
        remove_node(node[0])  # "From Wikipedia, the free encyclopedia"
        remove_node(node[1])  # Jump to:navigation,
        try:
            for n in node.xpath('//table'):
                remove_node(n)
        except IndexError:
            pass
        EXCLUDED_SECTIONS = ('See_also', 'Further_Reading', 'Further_reading',
            'Notes', 'External_Links', 'References', 'External_links',
            'Footnote', 'Footnotes', 'Sources', 'Notes_and_sources',
            'Bibliography', 'Notes_and_references',
            'Works', 'Publications',
            'List_of_works', 'Selected_works', 'Novels',
            'Selected_bibliography', 'Publications_and_speeches',
            'Records', 'Other_Records', 'Career_statistics', 'Teams_and_victories',
            'Professional_wins',
            'Accomplishments', 'Championships_and_accomplishments',
            'Records_and_achievements', 'Major_Projects', 'Discography',
            'Original_Compositions', 'Videography', 'Notable_works',
            'Filmography', 'Selected_filmography', 'Partial_filmography',
            'Music_videography', 'Books',
            'Achievements', 'Notable_roles',
            'Television_appearances', 'In_popular_culture',
            'Popular_culture',
            'Titles_and_honours', 'Honours', 'Awards', 'Awards_and_fellowships',
            'Awards_and_nominations', 'Career_highlights',
            'Awards_and_honors', 'Recognition', 'Selected_awards', 'Other_honors',
            'Honors_and_awards', 'Awards_and_recognitions',
            'Titles.2C_styles_and_honours', 'Honorary_degrees', 'Prizes',
            )
        for misc in EXCLUDED_SECTIONS:
            try:
                misc_node = node.xpath('//span[@id="%s"]/ancestor::h2' % misc)[0]
                [remove_node(n) for n in misc_node.xpath('./following-sibling::*')]
                remove_node(misc_node)
            except IndexError:  # in case no node found
                pass
        return node

    html = clean_html(html)
    html_parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True,
        remove_pis=True, strip_cdata=True)
    doc = etree.parse(StringIO(html), html_parser)
    div = doc.xpath("//div[@id='bodyContent']")[0]
    #dbg(" div = %s" % ('\n'.join(n.tag for n in div[1:10])))
    #dbg(lxml.html.tostring(div, pretty_print=True))
    div = trim_wp(div)
    text = get_text(div)
    text = text.replace('[edit]', '')
    return text, len(text.split())
def get_eb_text_wc(html, web_queries, do_recurse, do_refresh):
    '''Return content of article in textual form without miscellany.
    Will follow links if necessary since EB pages are dynamic.
    '''

    def trim_eb(node):
        '''Trim material following the citations.'''
        try:
            misc_node = node.xpath('//h2[text() = "Citations"]')[0]
            dbg(" misc_node %s" % misc_node)
            [remove_node(n) for n in misc_node.xpath('./following::*')]
            remove_node(misc_node)
            dbg(" trimmed it!")
        except IndexError:  # in case no node found
            pass
        return node

    html_parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True,
        remove_pis=True, strip_cdata=True)
    doc = etree.parse(StringIO(html), html_parser)
    if do_recurse:  # EB articles might have other sections to fetch
        EXCLUDED_SECTIONS = (
            "Major Works", "Additional Reading", "Biographies",
            "Critical studies", "Related Articles", "Supplemental Information",
            "Quotations", "Spotlights", "External Web sites", "Citations",
            "Year in Review Links",)
        info(" YES recurse")
        text = ''
        # Find links to other sections
        toc_options = doc.xpath("//div[@id='bps-article-toc']/select/option")
        info("toc_options = %s" % toc_options)
        toc_options.pop(1)  # skip the second since it's included in Main
        dbg(" toc_options %s" % toc_options)
        for opt in toc_options:
            opt_url = 'http://www.britannica.com' + opt.get('value')
            if opt.get('title') not in EXCLUDED_SECTIONS and '#' not in opt_url:
                dbg(" checking %s" % opt.get('title'))
                dbg(" opt_url %s" % opt_url)
                html = query_web(opt_url, web_queries, do_refresh)
                html_parser = etree.HTMLParser(remove_comments=True)
                doc = etree.parse(StringIO(html), html_parser)
                div = doc.xpath('//div[@class="KonaBody"]')  # /ancestor::div
                if div:
                    div = trim_eb(div[0])
                    text += get_text(div)
                else:
                    dbg(" didn't find content in %s" % opt.get('title'))
                    dbg(html)
    else:
        dbg(" NO recurse")
        div = doc.xpath('//div[@id="bps-left-article-wrapper"]')[0]
        div = trim_eb(div)
        dbg(" div %s" % div)
        text = get_text(div)
    return text, len(text.split())
wgCurRevisionId_regexp = re.compile(r'wgCurRevisionId=(\d+)')
oldid_regexp = re.compile(r'oldid=(\d+)')

def get_wp_oldid(wp_html):
    '''Return oldid for construction of permanent URI.'''
    wgCurRevisionId = wgCurRevisionId_regexp.search(wp_html)
    if wgCurRevisionId:
        info("wgCurRevisionId.groups() = %s" % wgCurRevisionId.groups())
        return wgCurRevisionId.groups()[0]
    else:
        oldid = oldid_regexp.search(wp_html)
        if oldid:
            info("oldid.groups() = %s" % oldid.groups())
            return oldid.groups()[0]
        else:
            critical("No oldid found")
            raise FailedGet

def get_wp_purl(bio):
    '''Return formatted permanent URI.'''
    root_url = u'http://en.wikipedia.org/w/index.php?title='
    return root_url + bio.wp_url.split('/')[-1] + '&oldid=' + bio.wp_oldid
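
# For example (a hypothetical bio; the oldid value is made up):
#
#   >>> b = Biography('Ada Lovelace')
#   >>> b.wp_url, b.wp_oldid = 'http://en.wikipedia.org/wiki/Ada_Lovelace', '555'
#   >>> get_wp_purl(b)
#   u'http://en.wikipedia.org/w/index.php?title=Ada_Lovelace&oldid=555'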
###############
# Gender guessing functions
###############

he_re = re.compile(r'\b([Hh]is|[Hh]e)\b')
she_re = re.compile(r'\b([Hh]er|[Ss]he)\b')

def guess_gender_pronouns(text):
    '''Guess gender based on proportion of pronouns.'''
    info("guessing gender via pronouns")
    she = len(she_re.findall(text))
    he = len(he_re.findall(text))
    diff = abs(she - he) / (he + she + 0.1)
    dbg("he = %d, she = %d, diff = %f" % (he, she, diff))
    if diff < 0.25:  # too close to call
        gender = 'unknown'
    elif she > he:
        gender = 'female'
    else:
        gender = 'male'
    dbg("gender = %s" % gender)
    return gender
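
# For example (doctest-style sketch on made-up text; assumes info/dbg are
# bound as in __main__):
#
#   >>> guess_gender_pronouns('She wrote the book. Her prose is vivid.')
#   'female'
#   >>> guess_gender_pronouns('He and she spoke; his words and her words.')
#   'unknown'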
def create_name_dict(fn):
    '''Utility function to build dictionaries of name frequencies.'''
    d = {}
    names = open(fn).readlines()
    for line in names[1:]:  # skip header line
        if ' na ' not in line:
            name, frequency, number, rank = line.split()
            d[name] = {}
            d[name]['freq'] = float(frequency)
    return d
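
# The expected file layout (inferred from the parsing above; these appear to
# be census-style name-frequency tables) is a header line followed by
# whitespace-separated rows of name, frequency, number, and rank, e.g.:
#
#   name freq number rank
#   MARY 2.629 2.629 1
#   PATRICIA 1.073 3.702 2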
FEMALE_NAMES = create_name_dict(HOME + '/joseph/2010/03/names-female.csv')
MALE_NAMES = create_name_dict(HOME + '/joseph/2010/03/names-male.csv')

HONORIFIC_MALE = [honor + ' ' for honor in
    ('Baron', 'Brother', 'Comte', 'Count', 'Duc', 'Duke', 'Earl of',
    'Father', 'Marquess', 'Marquis', 'Prince', 'Sir',
    'Viscount', 'Vicomte')]
HONORIFIC_FEMALE = [honor + ' ' for honor in
    ('Baroness', 'Comtesse', 'Countess', 'Dame',
    'Marchioness', 'Marquise', 'Princess', 'Sister', 'Queen')]
def guess_gender_name(name):
    '''Guess the gender based only on the name, using honorifics and
    statistical tables of name frequencies.

    >>> guess_gender_name('Joseph Reagle')
    'male'
    >>> guess_gender_name('Sir Jehne Smith')
    'male'
    >>> guess_gender_name('Dame Jijij Foo')
    'female'
    '''
    info("guessing gender via names")
    gender = 'unknown'
    if any([honor in name for honor in HONORIFIC_MALE]):
        gender = 'male'
    elif any([honor in name for honor in HONORIFIC_FEMALE]):
        gender = 'female'
    if gender == 'unknown':
        name = name.replace('Dr. ', '')
        given = name.split()[0].upper()
        if given in FEMALE_NAMES and given in MALE_NAMES:
            dbg("freqs = %f %f" % (MALE_NAMES[given]['freq'], FEMALE_NAMES[given]['freq']))
            # for ambiguous names, require a 4:1 frequency ratio before deciding
            if MALE_NAMES[given]['freq'] > 4 * FEMALE_NAMES[given]['freq']:
                gender = 'male'
            elif FEMALE_NAMES[given]['freq'] > 4 * MALE_NAMES[given]['freq']:
                gender = 'female'
        else:
            if given in MALE_NAMES:
                gender = 'male'
            if given in FEMALE_NAMES:
                gender = 'female'
    info("gender = %s" % gender)
    return gender
###############
# Biography class
###############

class Biography():
    name = ''
    last_name = ''
    born = ''
    died = ''
    gender = 'unknown'
    wp_title = ''
    wp_url = ''
    wp_purl = ''
    wp_html = ''
    wp_text = ''
    wp_wc = 0
    wp_ratio = 0
    wp_oldid = ''
    eb_title = ''
    eb_url = ''
    eb_html = ''
    eb_text = ''
    eb_wc = 0

    def __init__(self, name, born='', died=''):
        self.name = name
        self.last_name = name.split(' OR ')[0].split('(')[0].split()[-1]  # use first last name
        self.born = born
        self.died = died

    def __str__(self):
        #showList = ["a", "b"]
        showList = sorted(set(self.__dict__))
        return ("X(%i):\n" % id(self)) + "\n".join([" %s: %s" % (
            key.rjust(8), self.__dict__[key]) for key in showList])
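
# A minimal usage sketch (hypothetical values):
#
#   >>> bio = Biography('Mary Shelley', born='1797', died='1851')
#   >>> bio.last_name
#   'Shelley'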
###############
# Web scrape and build bios
###############

def build_bios(people_proj, people):
    """Build biography for each person including EB/WP information."""
    bios = FileDict(filename='cache/bios-%s' % people_proj + '.db')
    google_queries = FileDict(filename='cache/google-%s' % people_proj + '.db')
    web_queries = FileDict(filename='cache/web-%s' % people_proj + '.db')
    if opts.delete:
        del bios[opts.delete]
        return bios
    CUTOFF = 1900  # omit birth year from queries for pre-1900 births
    for person in people:
        name, born, died = split_name_date(person)
        critical("** Checking '%s' <%s - %s> **" % (name, born, died))
        bio = Biography(name, born, died)
        if opts.refresh and bio.name in opts.refresh:  # so I can pass in more than one name
            do_refresh = True
        else:
            do_refresh = False
        if name in bios:
            if opts.fast_cache or opts.cache_only:
                info(" '%s' is duplicated, skipping" % name)
                continue
        else:
            if opts.cache_only:
                critical("No cached bio found for '%s'" % name)
                raise FailedGet
        info("WIKIPEDIA")
        site = 'en.wikipedia.org'
        if born and int(born) < CUTOFF:
            query = 'site:%s (%s)' % (site, name)
        else:
            query = 'site:%s (%s %s)' % (site, name, born)
        dbg(u" query = %s" % query)
        results = query_google(query, google_queries, do_refresh)
        info(" results %s" % [r['titleNoFormatting'] for r in results])
        for result in results:
            if url_OK(result['url']) and are_similar(bio, result):
                info(" using %s" % result['url'])
                bio.wp_title = unescape(result['titleNoFormatting'].split(' -')[0])
                bio.wp_url = urllib2.unquote(result['url'].encode()).decode(
                    'utf-8', 'replace')
                bio.wp_html = query_web(bio.wp_url, web_queries, do_refresh)
                #if not opts.ambiguous and \
                #        ('disambigbox' in bio.wp_html or 'setindexbox' in bio.wp_html):
                DISAMBIG = ('disambigbox', 'setindexbox')
                if not opts.ambiguous and any([disambig in bio.wp_html for disambig in DISAMBIG]):
                    critical("disambiguating %s at %s" % (bio.wp_title, bio.wp_url))
                    bio.wp_title = bio.wp_title + ' [disambiguated]'
                    parser_dbox = etree.HTMLParser(remove_comments=True)
                    doc_dbox = etree.parse(StringIO(bio.wp_html), parser_dbox)
                    url_dbox = 'http://en.wikipedia.org' + doc_dbox.xpath(
                        "//div[@id='bodyContent']/ul//a[not(@class='new')]/@href")[0]
                    critical("url_dbox %s" % url_dbox)
                    bio.wp_url = url_dbox
                    bio.wp_html = query_web(bio.wp_url, web_queries, do_refresh)
                #info(" bio.wp_html %s" % bio.wp_html)
                bio.wp_oldid = get_wp_oldid(bio.wp_html)
                bio.wp_purl = get_wp_purl(bio)
                bio.wp_text, bio.wp_wc = get_wp_text_wc(bio.wp_html)
                if opts.gender and bio.gender == 'unknown':
                    bio.gender = guess_gender_pronouns(bio.wp_text)
                break
            else:
                info(" not using %s" % result['url'])
        info("BRITANNICA")
        site = 'www.britannica.com'  # /EBchecked/topic/
        if born and int(born) < CUTOFF:
            query = 'site:%s (%s)' % (site, name)
        else:
            query = 'site:%s (%s %s)' % (site, name, born)
        info(" query = %s" % query)
        results = query_google(query, google_queries, do_refresh)
        info(" results %s" % [r['titleNoFormatting'] for r in results])
        for result in results:
            if url_OK(result['url']) and are_similar(bio, result):
                info(" using %s" % result['url'])
                do_recurse = False
                bio.eb_title = unescape(result['titleNoFormatting'].split(' --')[0])
                bio.eb_url = urllib2.unquote(result['url'].encode())
                if bio.eb_url.split('/')[-3] == 'topic':
                    do_recurse = True  # get subsections
                bio.eb_html = query_web(bio.eb_url, web_queries, do_refresh)
                bio.eb_text, bio.eb_wc = get_eb_text_wc(bio.eb_html,
                    web_queries, do_recurse, do_refresh)
                if opts.gender and bio.gender == 'unknown':
                    bio.gender = guess_gender_pronouns(bio.eb_text)
                break
            else:
                info(" not using %s" % result['url'])
        if opts.gender and bio.gender == 'unknown':
            bio.gender = guess_gender_name(name)
        if bio.wp_wc and bio.eb_wc:
            bio.wp_ratio = bio.wp_wc / float(bio.eb_wc)
        else:
            bio.wp_ratio = 0
        bios[name] = bio
    return bios
###############
# Options and source file
###############

if __name__ == '__main__':
    opt_parser = OptionParser(usage="usage: %prog [options] file")
    opt_parser.add_option("-a", "--ambiguous",
        action="store_true", default=False,
        help="don't disambiguate WP pages w/ disambigbox")
    opt_parser.add_option("-g", "--gender",
        action="store_true", default=False,
        help="perform gender analysis")
    opt_parser.add_option('-l', '--log-to-file',
        action="store_true", default=False,
        help="log to file topics-comp.log")
    opt_parser.add_option("-n", "--no-dates",
        action="store_true", default=False,
        help="don't use source birth/death dates")
    opt_parser.add_option("-f", "--fast-cache",
        action="store_true", default=False,
        help="use cache if available, else query Web")
    opt_parser.add_option("-c", "--cache-only",
        action="store_true", default=False,
        help="cache only, do no Web queries")
    opt_parser.add_option("-r", "--refresh",
        help="refresh a particular name in bios cache",
        metavar="NAME")
    opt_parser.add_option("-d", "--delete",
        help="delete a particular name from bios cache",
        metavar="NAME")
    opt_parser.add_option("-t", "--text-include",
        action="store_true", default=False,
        help="include text of article in results {{copyvio}}")
    opt_parser.add_option("-e", "--export-csv",
        action="store_true", default=False,
        help="export a comma separated file in addition to HTML")
    opt_parser.add_option('-v', '--verbose',
        action='count',
        help="increase verbosity (specify multiple times for more)")
    opts, args = opt_parser.parse_args()

    if opts.refresh:
        opts.refresh = opts.refresh.decode('utf-8')
    if opts.log_to_file:
        log_dest = open('topics-comp.log', 'w')
    else:
        log_dest = sys.stderr
    log_level = 100  # default
    if opts.verbose == 1: log_level = logging.CRITICAL  # DEBUG
    elif opts.verbose == 2: log_level = logging.INFO
    elif opts.verbose >= 3: log_level = logging.DEBUG
    logging.basicConfig(stream=log_dest, level=log_level,
        format="%(levelno)s %(funcName).5s: %(message)s")
    critical = logging.critical
    info = logging.info
    dbg = logging.debug

    source_fn = args[0]
    source_fn_base = source_fn.split('.')[0]
    data = codecs.open(source_fn, 'r', 'utf-8').readlines()
    people_url = data[0].strip()  # first line is the URL of the source list
    people = [p.strip() for p in data[1:]]
    bios = build_bios(source_fn_base, people)
    create_html_report(source_fn_base, bios, people, people_url)
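
# Usage sketch (hypothetical file names; the actual source lists are not
# included in this gist). A source file's first line is the URL of the list,
# and each subsequent line is a name with optional <born - died> dates:
#
#   http://example.org/notable-people
#   Mary Shelley <1797 - 1851>
#   Ada Lovelace <1815 - 1852>
#
# Then, e.g.:
#
#   python2.6 comp-topics.py --gender -v people.txt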