# Created May 16, 2013 12:25
# The program I used to analyze the balance of gender coverage in Wikipedia and Britannica. The gender-guessing heuristics might be useful to others.
# -*- coding: utf-8 -*-
# (c) Copyright 2011-2013 by Joseph Reagle
# Licensed under the GPLv3, see <>
import codecs
from difflib import SequenceMatcher
from cfiledict import FileDict # compressed FileDict
#from filedict import FileDict #
import logging
import lxml
import lxml.etree
from lxml.html.clean import clean_html
from lxml.html import builder as E #
from lxml.html import fragment_fromstring #
from StringIO import StringIO
from optparse import OptionParser
import random
import re
from rpy import r
import simplejson
import sys
import time
import unicodedata
import urllib
import urllib2
from web_little import get_HTML #
from os import environ
HOME = environ['HOME']
#import socket
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
# Exceptions
class FailedGet(Exception):
    '''Exception for when one can't return something.

    Raised by the query/lookup helpers in this file when no result can be
    produced (for example, a cache miss while running cache-only).
    '''
# Maths
def L_ratio(seq1, seq2):
    '''Sugar function for difflib.SequenceMatcher.ratio().

    Logs both sequences at debug level and returns their similarity ratio
    rounded to two decimal places.  The arguments may be any pair of
    sequences SequenceMatcher accepts (strings, lists of words, ...).
    '''
    dbg(" comparing '%s' w/ '%s'" %(seq1, seq2))
    return round(SequenceMatcher(None, seq1, seq2).ratio(), 2)
# String functions
try:
    from htmlentitydefs import name2codepoint
except ImportError:  # the module is named html.entities on Python 3
    from html.entities import name2codepoint
try:
    unichr
except NameError:  # Python 3: chr() already returns a text character
    unichr = chr

name2codepoint['#39'] = 39 # python 2.5.2 doesn't have apostrophe


def unescape(s):
    """Unescape HTML code refs; c.f.

    Replaces every ``&name;`` reference whose name is a key of
    ``name2codepoint`` (plus the numeric ``&#39;``) with the corresponding
    character.  Unknown references are left untouched.
    """
    return re.sub('&(%s);' % '|'.join(name2codepoint),
                  lambda m: unichr(name2codepoint[m.group(1)]), s)
def strip_accents(text):
    """Test if ascii, if not, remove accents.

    Non-ASCII text is decomposed (NFD) and its combining marks are dropped.
    In every case hyphens become spaces, double quotes are removed and the
    result is stripped of surrounding whitespace.

    #>>> strip_accents(u'nôn-åscîî') # fails because of doctest bug
    """
    def not_combining(char):
        return unicodedata.category(char) != 'Mn'

    try: # test if ascii
        text.encode('ascii')
    except UnicodeEncodeError:
        normalized_text = unicodedata.normalize('NFD', text)
        # join() rather than filter(): filter() only returns a string on
        # Python 2, and the .replace() calls below need one.
        result_text = ''.join(
            char for char in normalized_text if not_combining(char))
    else:
        result_text = text
    result_text = result_text.replace('-', ' ').replace('"','').strip()
    return result_text
def are_similar(bio, result):
    """Compare list title with returned titles for relatedness.

    bio:    object whose ``name`` holds one or more alternatives separated
            by ' OR '.
    result: search-result dict; only 'titleNoFormatting' is read.

    Returns True as soon as one alternative matches the result title, either
    because one word set contains the other or because the word-level or
    character-level similarity ratio reaches the threshold chosen for the
    number of words in the name.  Returns False otherwise.
    """
    title = result['titleNoFormatting'].split(' -')[0].strip() # remove encyclopedia
    for name in bio.name.split(' OR '):
        if '(' not in name: # if no paren in name, remove paren from title
            title = title.split('(')[0]
        if ', ' in title: # Britannica sometimes uses Last, First
            if len(title.split(',')) == 2:
                last, first = title.split(', ')
                title = first + ' ' + last
        dbg(" comparing %s" %name)

        name_chars = unescape(strip_accents(name)).lower()
        title_chars = unescape(strip_accents(title)).lower()
        name_wrds = name_chars.split()
        title_wrds = title_chars.split()
        name_set = set(name_wrds)
        title_set = set(title_wrds)

        if (name_set.issubset(title_set) or title_set.issubset(name_set)) and \
                (len(title_set) > 1 and 'surname' not in title_set):
            dbg(" PASSED sup/sub %s" % title_set)
            return True

        # Single-word values double as the default so the thresholds are
        # always bound, even for an empty name.
        words_threshold = 0.68 # 0.67
        chars_threshold = 0.77 # 0.68
        name_len = len(name_wrds)
        if name_len == 2:
            words_threshold = 0.62
            chars_threshold = 0.71 # 0.69
        elif name_len == 3:
            words_threshold = 0.62
            chars_threshold = 0.70 # 0.69
        elif name_len == 4:
            words_threshold = 0.45 # 0.40
            chars_threshold = 0.60
        elif name_len >= 5: # large name have titles often out of order
            name_wrds = sorted(name_wrds)
            title_wrds = sorted(title_wrds)
            name_chars = ' '.join(name_wrds)
            title_chars = ' '.join(title_wrds)
            words_threshold = 0.81
            chars_threshold = 0.80

        words_ratio = L_ratio(name_wrds, title_wrds)
        if words_ratio >= words_threshold:
            dbg(u" PASSED words %2.2f >= %2.2f '%s'" % (
                words_ratio, words_threshold, title_wrds))
            return True
        dbg(u" FAILED words %2.2f < %2.2f '%s'" % (
            words_ratio, words_threshold, title_wrds))

        chars_ratio = L_ratio(name_chars, title_chars)
        if chars_ratio >= chars_threshold:
            dbg(u" PASSED chars %2.2f >= %2.2f '%s'" % (
                chars_ratio, chars_threshold, title_chars))
            return True
        dbg(u" FAILED chars %2.2f < %2.2f '%s'" % (
            chars_ratio, chars_threshold, title_chars))
    return False
def split_name_date(person):
    """Some source lists have optional dates for query, split them up

    An entry is either a bare name or ``name <born - died>``.  Returns the
    tuple (name, born, died); the dates are empty strings when the entry has
    none or when the ``no_dates`` option is set.
    """
    dbg("person = '%s'" % person)
    name = born = died = ''
    if '<' in person:
        name, date = person.rsplit(' <', 1)
        born, died = date[:-1].split(' - ')  # [:-1] drops the closing '>'
    else:
        name = person
    if opts.no_dates:
        born = died = ''
    return name, born, died
# Create report
# Build the coverage report for one source list.
#   source_fn_base: base name used in the page title and in the
#                   '.html' / '.csv' / '.png' output names.
#   bios:           mapping indexed by the name split_name_date() returns
#                   for each entry of `people`.
#   people:         source-list entries.
#   people_url:     used as the href of each row's source link.
# Reads the module-level `opts` (export_csv, gender, text_include) and talks
# to R through `r`.  The document is written to report_f; no return
# statement is visible.
# NOTE(review): this function is visibly incomplete as it stands (unclosed
# calls, missing call targets, missing loop bodies).  The notes below mark
# the spots; restore them from the upstream copy before running.
def create_html_report(source_fn_base, bios, people, people_url):
"""Return HTML report of topical encyclopedic converage."""
critical("performing analysis for report")
# NOTE(review): the callee in front of `+ '.html'` / `+ '.csv'` is missing;
# the ('w', 'UTF-8', 'replace') arguments look like codecs.open(
# source_fn_base + ..., ...) -- confirm.
report_f = + '.html', 'w', 'UTF-8', 'replace')
if opts.export_csv:
csv_f = + '.csv', 'w', 'UTF-8', 'replace')
DOCTYPE = u'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "">'
# NOTE(review): the E.HTML(...) call below is never closed and E.LINK mixes
# stylesheet attributes with inline CSS text, so some lines are missing here.
html = E.HTML(
E.TITLE("%s Biographies" % source_fn_base),
E.LINK(rel="stylesheet", type="text/css",
".male {background-color: #f2f2ff}"
".female {background-color: #fff2fd}"
".unknown {background-color: #f4fff2}", type="text/css")
# for R dataframe
genders = []
wp_sizes = []
eb_sizes = []
# NOTE(review): nothing visible appends to genders / wp_sizes / eb_sizes
# inside this loop, yet they feed the data frame below -- the appends appear
# to be missing.
for person in people:
key, dummy, dummy = split_name_date(person)
bio = bios[key]
# Statistical analysis
# The counts and medians below are computed in R on the 'refwork' data frame.
# NOTE(review): the frame is built with wp_size/eb_size keywords but queried
# as wp.size/eb.size -- presumably rpy maps '_' to '.'; confirm.
r.assign('refwork', r.data_frame(gender=genders, wp_size=wp_sizes, eb_size=eb_sizes))
SRCmale = r('sum(refwork$gender == "male")')
SRCfem = r('sum(refwork$gender == "female")')
SRCun = r('sum(refwork$gender == "unknown")')
WPmale = r('sum(refwork$gender == "male" & refwork$wp.size != 0)')
WPfem = r('sum(refwork$gender == "female" & refwork$wp.size != 0)')
EBmale = r('sum(refwork$gender == "male" & refwork$eb.size != 0)')
EBfem = r('sum(refwork$gender == "female" & refwork$eb.size != 0)')
SRCgendered = r('sum(refwork$gender != "unknown")')
WPgendered = r('sum(refwork$gender != "unknown" & refwork$wp.size != 0)')
EBgendered = r('sum(refwork$gender != "unknown" & refwork$eb.size != 0)')
inSRC = r('length(refwork$gender)')
missingEB = r('sum(refwork$eb.size == 0)')
missingWP = r('sum(refwork$wp.size == 0)')
missingBoth = r('sum(refwork$eb.size == 0 & refwork$wp.size == 0)')
wp_existing_summary = ' '.join((' %s %5.0f ' % (k, v)
for k,v in r('summary(refwork$wp.size[refwork$wp.size != 0], digits=5)').items()))
eb_existing_summary = ' '.join((' %s %5.0f ' % (k, v)
for k,v in r('summary(refwork$eb.size[refwork$eb.size != 0], digits=5)').items()))
wp_median = r('median(refwork$wp.size[refwork$wp.size != 0])')
eb_median = r('median(refwork$eb.size[refwork$eb.size != 0])')
mutual_wp_median = r('median(refwork$wp.size[refwork$wp.size != 0 & '
'refwork$eb.size != 0])')
mutual_eb_median = r('median(refwork$eb.size[refwork$eb.size != 0 & '
'refwork$wp.size != 0])')
# NOTE(review): from here to the E.IMG line the element constructors that
# wrap the text (and the closing parentheses) are mostly missing; the format
# arguments of gender_text are also shorter than its placeholders.
# NOTE(review): the ratios below divide by WPgendered / EBgendered /
# eb_median / mutual_eb_median with no visible zero check.
body = E.BODY(
E.H1("%s Biographies" % source_fn_base)
"Missing: WP = %d ; EB = %d ; neither = %d . "
%(missingWP, missingEB, missingBoth)),
if opts.gender:
gender_text = (
"<p>Of %d entries: I guess that %d are <span class='female'>female</span>, "
"%d are <span class='male'>male</span> "
"and %d are <span class='unknown'>unknown</span>. "
"That is, females are %0.2f of the gender-known population. "
"Of the Wikipedia articles, females are %0.2f (%d/(%d+%d)); "
"and %0.2f (%d/(%d+%d)) at Britannica. </p>"
% (
inSRC, SRCfem,
float(WPfem)/(WPgendered), WPfem, WPmale, WPfem,
float(EBfem)/(EBgendered), EBfem, EBmale, EBfem
"Existing median word count (for articles that exist in each work alone): WP = %0.0f ; EB = %0.0f . "
"WP median article size is roughly %0.1f times larger."
%(wp_median, eb_median, wp_median/eb_median) ),
E.P("Five figure summaries for existing articles."),
"WP: %s \n"
"EB: %s "
%(wp_existing_summary, eb_existing_summary) ),
"Mutual median word count (for articles that exist in both works only):, WP median = %0.0f words, EB = %0.0f . "
"Mutual WP median article size is roughly %0.1f times larger."
%(mutual_wp_median, mutual_eb_median, mutual_wp_median/mutual_eb_median) ),
# R writes the box plot to '<source_fn_base>.png', which the IMG references.
r('png(file="%s.png")' % source_fn_base) # ,width=733,height=550
r('boxplot(refwork$wp.size[refwork$wp.size != 0], refwork$eb.size[refwork$eb.size != 0], '
'names = c("Wikipedia", "Britannica"), '
'main = "Existing article word counts")' )
E.IMG(alt="size distribution", src="%s.png" % source_fn_base)
critical("generating table")
# NOTE(review): the header-row constructor around the E.TH cells is missing
# (unbalanced parentheses on the 'Words' line).
table = E.TABLE(
E.COL(), E.COL(E.CLASS('col-alt')), E.COL(),
E.COL(E.CLASS('col-alt')), E.COL(),
E.TH('Wikipedia', width="35%"), E.TH('Words'),
E.TH('Britannica', width="35%"), E.TH('Words'))),
width="100%", cellpadding="5", border="1")
#if opts.export_csv:
#csv_f.write('source, name, gender, born, died, list, count \n')
# One table row per person; with text_include a second row carries the
# article text, one <p> per line of text.
# NOTE(review): the row constructor / append call around the E.TD cells is
# missing, and the first argument of E.A( and of the csv tuples (after
# `source,`) has been lost -- presumably the person's name; confirm.
for person in people:
key, dummy, dummy = split_name_date(person)
bio = bios[key]
E.TD(E.A(, href='%s' % (people_url)), # , bio.last_name
E.TD(E.A(bio.wp_title, href=bio.wp_purl), E.CLASS(bio.gender)),
E.TD(str(bio.wp_wc)), # size
E.TD(E.A(bio.eb_title, href=bio.eb_url), E.CLASS(bio.gender)),
E.TD(str(bio.eb_wc)), # size
valign = "center",
if opts.text_include:
E.TD(colspan='2', *[E.P(p) for p in bio.wp_text.split('\n')]),
E.TD(colspan='2', *[E.P(p) for p in bio.eb_text.split('\n')]),
if opts.export_csv:
# The base name is split on '-' and one WP line plus one EB line is written
# for every purely alphabetic part; fields are ';'-separated.
for source in source_fn_base.split('-'):
if source.isalpha():
csv_f.write(u'%s;%s;%s;%s;%s;WP;%s\n' %
(source,, bio.gender, bio.born, bio.died, bio.wp_wc) )
csv_f.write(u'%s;%s;%s;%s;%s;EB;%s\n' %
(source,, bio.gender, bio.born, bio.died, bio.eb_wc) )
report_f.write(DOCTYPE + lxml.html.tostring(html, pretty_print=True))
# NOTE(review): the body of this final `if` is missing (likely closing
# csv_f); report_f is not visibly closed either.
if opts.export_csv:
# Web
def url_OK(url):
    '''Check if URL is in right Web space and with textual extension.

    Returns False for URLs ending in a document/image extension, for
    Britannica URLs outside /EBchecked/topic/, and for Wikipedia
    disambiguation pages or sister-project namespaces; True otherwise.
    '''
    dbg(" testing %s" % url)
    is_ok = True
    BAD_EXTENSIONS = ('doc', 'pdf', 'jpg', 'png', 'gif') # Google
    BAD_WP_NAMESPACES = ('/Wikiquote:', '/Wikisource:')
    if url.split('.')[-1].lower() in BAD_EXTENSIONS:
        is_ok = False
    # NOTE(review): both site tests read `'' in url` (always true), which
    # rejected every non-Britannica URL.  The host substrings below are
    # reconstructed from the EB path and the "# WP" comment -- confirm
    # against the upstream copy.
    if 'britannica.com' in url:
        if '/EBchecked/topic/' not in url:
            is_ok = False
    if 'wikipedia.org' in url:
        if '(disambiguation)' in url: # WP
            is_ok = False
        if any(ns in url for ns in BAD_WP_NAMESPACES):
            is_ok = False
    dbg(" url_OK = %s" % is_ok)
    return is_ok
# Look `query` up in the google_queries cache and, on a miss or when
# do_refresh is set (and not cache-only), ask the search API and store the
# result list back in the cache.
#   query:          search string; it is UTF-8 encoded before URL-encoding.
#   google_queries: dict-like cache indexed by query.
#   do_refresh:     force a new request even if the query is cached.
#   retry_counter:  recursion depth of the retry-on-URLError path.
# Raises FailedGet on a 404 responseStatus, when retry_counter reaches 2, or
# when nothing is cached under the cache-only option.
# NOTE(review): this function is incomplete as it stands -- see notes below.
def query_google(query, google_queries, do_refresh, retry_counter=0):
"""Return ordered results from Google API via google_queries cache."""
info(" testing google_queries for '%s'" % query)
results = google_queries.get(query, None)
if do_refresh or (results is None and not opts.cache_only):
info(" refresh or g_cache MISS for %s" %query)
# NOTE(review): the endpoint prefix (first string literal) and the Referer
# value are empty; they appear to have been stripped.
url = "" + \
"search/web?start=0&v=1.0&%s" % ( # &rsz=large for 8 results
urllib.urlencode({'q': query.encode('utf-8')}))
request = urllib2.Request( url, None, {'Referer': ''})
# NOTE(review): no `try:` is visible for the `except` further down, and the
# simplejson.loads( call is not closed -- its argument (presumably the body
# read from search_results) is missing.
search_results = urllib2.urlopen(request)
json = simplejson.loads(
dbg(" responseStatus = %s" %json['responseStatus'])
if json['responseStatus'] == 404:
raise FailedGet("Google API: %s" %json['responseDetails'])
results = json['responseData']['results']
info(" got results")
except (urllib2.URLError) as e: # TypeError,
info(" retrying query_google after %s" %e)
# Back-off grows with each retry: 5s, 15s, 25s, ...
time.sleep((retry_counter * 10 + 5)) # pause before retying
results = query_google(query, google_queries, do_refresh, retry_counter + 1)
google_queries[query] = results
# NOTE(review): whether this check sits inside the except handler or after
# it cannot be told from the flattened text.
if retry_counter == 2:
raise FailedGet("Failed to get after 3 attempts.")
if results is None and opts.cache_only:
raise FailedGet("No cache result found with cache-only option.")
info(" returning results")
return results
def query_web(url, web_queries, do_refresh):
    """Grab Web page (encyclopedic article) from URL via web_queries cache.

    url:         page address; anything from the first '#' on is ignored.
    web_queries: dict-like cache indexed by URL.
    do_refresh:  fetch again even when the URL is cached.

    Returns the page HTML.  Raises FailedGet when nothing is cached and the
    cache-only option forbids fetching.
    """
    url = url.rsplit('#')[0] # don't bother with fragments
    info(" testing web_queries for '%s'" % url)
    dbg(" do_refresh = %s, opts.refresh = %s opts.cache_only = %s" %
        (do_refresh, opts.refresh, opts.cache_only))
    html = web_queries.get(url, None)
    if do_refresh or (html is None and not opts.cache_only):
        dbg(" retrieving from Web")
        time.sleep(random.randint(1, 3))  # random pause before each fetch
        html, response = get_HTML(url)
        web_queries[url] = html
    elif html is not None:
        dbg(" found in web_queries cache")
    # The original tested a bare `cache_only`, which is not defined anywhere;
    # the option lives on `opts`.
    if html is None and opts.cache_only:
        raise FailedGet("No cache result found with cache-only option.")
    return html
def get_text(content_node):
    '''Return textual content of nodes sans elements.

    Concatenates every text node found at or below content_node (via its
    ``xpath`` method) into one string.
    '''
    return ''.join(content_node.xpath("descendant-or-self::text()"))
def remove_node(node):
    '''Remove a node from a tree.

    Detaches node from its parent.  A node without a parent (e.g. the root)
    is left alone.
    '''
    parent = node.getparent()
    # NOTE(review): the statement that actually removed the node was missing;
    # parent.remove(node) is the reconstruction -- confirm upstream.
    if parent is not None:
        parent.remove(node)
# Parse a Wikipedia page, keep the 'bodyContent' div, trim it with trim_wp(),
# and return (text, word_count) where word_count is len(text.split()) after
# '[edit]' markers are removed.
# NOTE(review): incomplete as it stands -- see the notes below.
def get_wp_text_wc(html):
'''Return content of article in textual form without miscellany.'''
def trim_wp(node):
'''To keep analysis comparable, remove most everything but article prose.'''
# NOTE(review): the `try:` matching the `except IndexError:` below, the body
# of the table loop and the handler body are all missing.
remove_node(node[0]) # "From Wikipedia, the free encyclopedia"
remove_node(node[1]) # Jump to:navigation,
for n in node.xpath('//table'):
except IndexError:
# Section ids whose heading marks the end of the prose.
# NOTE(review): there is no comma after 'External_links', so it is joined
# with 'Footnote' into the single entry 'External_linksFootnote' and neither
# id can match.
# NOTE(review): 'sSee_also' looks like a typo for 'See_also', and
# 'Selected filmography' / 'Other honors' use a space where every other id
# uses '_' -- confirm.
# NOTE(review): the tuple is never closed and the loop that binds `misc`
# (plus its `try:`) is missing.
EXCLUDED_SECTIONS = ('sSee_also', 'Further_Reading', 'Further_reading',
'Notes', 'External_Links', 'References', 'External_links'
'Footnote', 'Footnotes', 'Sources', 'Notes_and_sources',
'Bibliography', 'Notes_and_references',
'Works', 'Publications',
'List_of_works', 'Selected_works', 'Novels',
'Selected_bibliography', 'Publications_and_speeches',
'Records', 'Other_Records', 'Career_statistics','Teams_and_victories',
'Accomplishments', 'Championships_and_accomplishments',
'Records_and_achievements', 'Major_Projects', 'Discography',
'Original_Compositions', 'Videography', 'Notable_works',
'Filmography', 'Selected filmography', 'Partial_filmography',
'Music_videography', 'Books',
'Achievements', 'Notable_roles',
'Television_appearances', 'In_popular_culture',
'Titles_and_honours', 'Honours', 'Awards', 'Awards_and_fellowships',
'Awards_and_nominations', 'Career_highlights',
'Awards_and_honors', 'Recognition', 'Selected_awards', 'Other honors',
'Honors_and_awards', 'Awards_and_recognitions',
'Titles.2C_styles_and_honours', 'Honorary_degrees', 'Prizes',
# Find the <h2> that encloses the span with this id and drop every sibling
# that follows it.
misc_node = node.xpath('//span[@id="%s"]/ancestor::h2' % misc)[0]
[remove_node(n) for n in misc_node.xpath('./following-sibling::*')]
except IndexError: # in case no node found
return node
html = clean_html(html)
# NOTE(review): no visible import binds the bare name `etree` (only
# `import lxml.etree` is shown) -- confirm.
html_parser = etree.HTMLParser(remove_blank_text = True, remove_comments = True,
remove_pis = True, strip_cdata = True)
doc = etree.parse(StringIO(html), html_parser)
div = doc.xpath("//div[@id='bodyContent']")[0]
#dbg(" div = %s" % ('\n'.join(n.tag for n in div[1:10])))
#dbg(lxml.html.tostring(div, pretty_print=True))
div = trim_wp(div)
text = get_text(div)
text = text.replace('[edit]', '')
return text, len(text.split())
# Extract the article text from a Britannica page and return
# (text, word_count), where word_count is len(text.split()).
#   html:        page HTML for the article.
#   web_queries: cache handed through to query_web() for section pages.
#   do_recurse:  when true, walk the options of the 'bps-article-toc' select
#                and fetch each section that is not excluded.
#   do_refresh:  handed through to query_web().
# NOTE(review): incomplete as it stands -- the docstring below is never
# closed, and the notes further down mark other missing lines.
def get_eb_text_wc(html, web_queries, do_recurse, do_refresh):
'''Return content of article in textual form without miscellany.
Will follow links if necessary since EB pages are dynamic.
def trim_eb(node):
'''Trim material following the citations.'''
# NOTE(review): the `try:` for the `except IndexError:` below and the handler
# body are missing.
misc_node = node.xpath('//h2[text() = "Citations"]')[0]
dbg(" misc_node %s" %misc_node)
[remove_node(n) for n in misc_node.xpath('./following::*')]
dbg(" trimmed it!")
except IndexError: # in case no node found
return node
html_parser = etree.HTMLParser(remove_blank_text = True, remove_comments = True,
remove_pis = True, strip_cdata = True)
doc = etree.parse(StringIO(html), html_parser)
if do_recurse: # EB articles might have other sections to fetch
# NOTE(review): the `EXCLUDED_SECTIONS = (` opening line is missing; the
# strings below are its visible remainder.
"Major Works", "Additional Reading", "Biographies",
"Critical studies", "Related Articles", "Supplemental Information",
"Quotations", "Spotlights", "External Web sites", "Citations",
"Year in Review Links",)
info(" YES recurse")
text = ''
# Find links to other sections
toc_options = doc.xpath("//div[@id='bps-article-toc']/select/option")
info("toc_options = %s" % toc_options)
toc_options.pop(1) # skip the second since its included in Main
dbg(" toc_options %s" %toc_options)
for opt in toc_options:
# NOTE(review): the site prefix in front of the option value is an empty
# string; it appears to have been stripped.
opt_url = '' + opt.get('value')
if opt.get('title') not in EXCLUDED_SECTIONS and '#' not in opt_url:
dbg(" checking %s" %opt.get('title'))
dbg(" opt_url %s" %opt_url)
html = query_web(opt_url, web_queries, do_refresh)
html_parser = etree.HTMLParser(remove_comments = True)
doc = etree.parse(StringIO(html), html_parser)
div = doc.xpath('//div[@class="KonaBody"]') # /ancestor::div
# NOTE(review): an `else:` is presumably missing before the "didn't find
# content" message, and another before " NO recurse" below.
if div:
div = trim_eb(div[0])
text += get_text(div)
dbg(" didn't find content in %s" %opt.get('title'))
dbg(" NO recurse")
div = doc.xpath('//div[@id="bps-left-article-wrapper"]')[0]
div = trim_eb(div)
dbg(" div %s" %div)
text = get_text(div)
return text, len(text.split())
wgCurRevisionId_regexp = re.compile(r'wgCurRevisionId=(\d+)')
oldid_regexp = re.compile(r'oldid=(\d+)')


def get_wp_oldid(wp_html):
    '''Return oldid for construction of permanent URI.

    Looks for ``wgCurRevisionId=<digits>`` first and falls back to
    ``oldid=<digits>``; returns the digits as a string.  Raises FailedGet
    when neither pattern occurs in wp_html.
    '''
    wgCurRevisionId = wgCurRevisionId_regexp.search(wp_html)
    if wgCurRevisionId:
        # Log only after the None check; calling .groups() on a failed
        # search raises AttributeError.
        info("wgCurRevisionId.groups() = %s" %wgCurRevisionId.groups())
        return wgCurRevisionId.groups()[0]
    oldid = oldid_regexp.search(wp_html)
    if oldid:
        info("oldid.groups() = %s" %oldid.groups())
        return oldid.groups()[0]
    critical("No oldid found")
    raise FailedGet
def get_wp_purl(bio):
    '''Return formatted permanent URI.

    Combines the last path segment of bio.wp_url (the article title) with
    bio.wp_oldid into an ``index.php?title=...&oldid=...`` link.
    '''
    # NOTE(review): root_url was an empty string, so the result was not a
    # URL.  The English-Wikipedia index.php prefix is a reconstruction
    # implied by the '&oldid=' suffix -- confirm host and scheme upstream.
    root_url = u'http://en.wikipedia.org/w/index.php?title='
    return root_url + bio.wp_url.split('/')[-1] + '&oldid=' + bio.wp_oldid
# Gender guessing functions
he_re = re.compile(r'\b([Hh]is|[Hh]e)\b')
she_re = re.compile(r'\b([Hh]er|[Ss]he)\b')


def guess_gender_pronouns(text):
    '''Guess gender based on proportion of pronouns.

    Counts he/his against she/her in text.  Returns 'unknown' when the
    counts differ by less than a quarter of their total, otherwise 'female'
    or 'male' according to the larger count.
    '''
    info("guessing gender via pronouns")
    she = len(she_re.findall(text))
    he = len(he_re.findall(text))
    # The 0.1 keeps the division in floating point and avoids dividing by
    # zero when no pronoun is found.
    diff = abs(she - he)/(he + she + 0.1)
    dbg("he = %d, she = %d, diff = %f" %(he, she, diff))
    if diff < 0.25:
        gender = 'unknown'
    elif she > he:
        gender = 'female'
    else:
        gender = 'male'
    dbg("gender = %s" %gender)
    return gender
def create_name_dict(fn):
    '''Utility function to build dictionaries of name frequencies.

    fn names a text file whose first line is skipped and whose other lines
    hold four whitespace-separated fields: name, frequency, number, rank.
    Lines containing ' na ' and blank lines are ignored.

    Returns {name: {'freq': float(frequency)}}.
    '''
    d = {}
    with open(fn) as names_file:  # close the handle once the lines are read
        names = names_file.readlines()
    for line in names[1:]:
        if ' na ' in line or not line.strip():
            continue
        name, frequency, number, rank = line.split()
        d[name] = {'freq': float(frequency)}
    return d
# Name-frequency tables consulted by guess_gender_name().  Both files are
# read when the module is loaded, from fixed paths below $HOME, so loading
# fails if either file is absent.
FEMALE_NAMES = create_name_dict(HOME+'/joseph/2010/03/names-female.csv')
MALE_NAMES = create_name_dict(HOME+'/joseph/2010/03/names-male.csv')
# Honorifics that settle the gender guess on their own.  Every entry carries
# a trailing space so that it only matches when followed by another word
# (e.g. 'Prince ' does not match inside 'Princess ').
_MALE_TITLES = (
    'Baron', 'Brother', 'Comte', 'Count', 'Duc', 'Duke', 'Earl of',
    'Father', 'Marquess', 'Marquis', 'Prince', 'Sir',
    'Viscount', 'Vicomte',
)
_FEMALE_TITLES = (
    'Baroness', 'Comtesse', 'Countess', 'Dame',
    'Marchioness', 'Marquise', 'Princess', 'Sister', 'Queen',
)
HONORIFIC_MALE = ['%s ' % title for title in _MALE_TITLES]
HONORIFIC_FEMALE = ['%s ' % title for title in _FEMALE_TITLES]
def guess_gender_name(name):
    '''Guess the gender based only on the name using honorifics and
    statistical tables of name frequencies

    An honorific from HONORIFIC_MALE / HONORIFIC_FEMALE anywhere in the name
    decides first (male checked before female), e.g. 'Sir Jehne Smith' is
    'male' and 'Dame Jijij Foo' is 'female'.  Otherwise the first word (after
    dropping 'Dr. ') is looked up in the name tables: a name in both tables
    needs to be more than four times as frequent in one to count, a name in
    only one table takes that table's gender.

    Returns 'male', 'female' or 'unknown'.
    '''
    info("guessing gender via names")
    gender = 'unknown'
    if any(honor in name for honor in HONORIFIC_MALE):
        gender = 'male'
    elif any(honor in name for honor in HONORIFIC_FEMALE):
        gender = 'female'
    if gender == 'unknown':
        words = name.replace('Dr. ', '').split()
        if words:  # an empty name has no given name to look up
            given = words[0].upper()
            if given in FEMALE_NAMES and given in MALE_NAMES:
                male_freq = MALE_NAMES[given]['freq']
                female_freq = FEMALE_NAMES[given]['freq']
                dbg("freqs = %f %f" % (male_freq, female_freq) )
                if male_freq > 4 * female_freq:
                    gender = 'male'
                elif female_freq > 4 * male_freq:
                    gender = 'female'
            # Without this else-branch the single-table checks also ran for
            # names found in both tables and always ended on 'female'.
            elif given in MALE_NAMES:
                gender = 'male'
            elif given in FEMALE_NAMES:
                gender = 'female'
    info("gender = %s" %gender)
    return gender
# Biography class
class Biography():
    '''Record of one person and what was found for them in each work.

    The class attributes below are the defaults; __init__ sets name,
    last_name, born and died on the instance.
    '''
    # Person as given in the source list.
    name = ''
    last_name = ''
    born = ''
    died = ''
    gender = 'unknown'
    # Wikipedia (wp_*) article data.
    wp_title = ''
    wp_url = ''
    wp_purl = ''
    wp_html = ''
    wp_text = ''
    wp_wc = 0
    wp_ratio = 0
    wp_oldid = ''
    # Britannica (eb_*) article data.
    eb_title = ''
    eb_url = ''
    eb_html = ''
    eb_text = ''
    eb_wc = 0

    def __init__(self, name, born = '', died = ''):
        '''Store the name and dates and derive last_name from the name.

        name may list alternatives separated by ' OR '; last_name is the
        final word of the first alternative, ignoring anything from the
        first '(' on.  It is '' when that leaves no word at all.
        '''
        self.name = name
        words = name.split(' OR ')[0].split('(')[0].split() # use first last name
        self.last_name = words[-1] if words else ''
        self.born = born
        self.died = died

    def __str__(self):
        '''Return the instance attributes, one "key: value" line each.'''
        #showList = ["a", "b"]
        showList = sorted(set(self.__dict__))
        return ("X(%i):\n" % id(self)) + "\n".join([" %s: %s" % (
            key.rjust(8), self.__dict__[key]) for key in showList])
# Web scrape and build bios
# Fill the bios cache for every entry of `people` and return it.
#   people_proj: project name used in the three cache file names under cache/.
#   people:      source-list entries, parsed with split_name_date().
# For each person a site-restricted search is run twice (first pass fills the
# wp_* fields, second pass the eb_* fields); results that pass url_OK() and
# are_similar() are fetched through query_web() and measured.  With the
# gender option, gender is guessed from pronouns and finally from the name.
# Reads the module-level `opts` (delete, refresh, fast_cache, cache_only,
# ambiguous, gender).
# NOTE(review): incomplete as it stands -- see the notes below.
def build_bios(people_proj, people):
"""Build biography for each person including EB/WP information."""
bios = FileDict(filename='cache/bios-%s' % people_proj + '.db')
google_queries = FileDict(filename='cache/google-%s' % people_proj + '.db')
web_queries = FileDict(filename='cache/web-%s' % people_proj + '.db')
# With the delete option the named entry is dropped and nothing is rebuilt.
if opts.delete:
del bios[opts.delete]
return bios
# Year compared with `born` to choose between the two query formats below.
CUTOFF = 1900
for person in people:
name, born, died = split_name_date(person)
critical("** Checking '%s' <%s - %s> **" % (name, born, died))
bio = Biography(name, born, died)
# NOTE(review): the left operand of `in opts.refresh` is missing (presumably
# the person's name), as is the `else:` before `do_refresh = False`.
if opts.refresh and in opts.refresh: # so I can pass in more than one name
do_refresh = True
do_refresh = False
# NOTE(review): the statements that skip an already-cached name (and the
# `else:` / `if` lines that lead to the cache-only failure) are missing, so
# the intended nesting of the next six lines cannot be told from here.
if name in bios:
if opts.fast_cache or opts.cache_only:
info(" '%s' is duplicated, skipping" % name)
if opts.cache_only:
critical("No cached bio found for '%s'" %name)
raise FailedGet
# NOTE(review): `site` is an empty string here and again before the second
# pass; the host names appear to have been stripped.
site = ''
# NOTE(review): both query assignments are unconditional as written; an
# `else:` is presumably missing, and which format belongs to which branch
# should be confirmed upstream.  int(born) raises ValueError for a
# non-numeric date.
if born and int(born) < CUTOFF:
query = 'site:%s (%s)' % (site, name)
query = 'site:%s (%s %s)' % (site, name, born)
dbg(u" query = %s" % query)
results = query_google(query, google_queries, do_refresh)
info(" results %s" % [r['titleNoFormatting'] for r in results])
for result in results:
if url_OK(result['url']) and are_similar(bio, result):
info(" using %s" %result['url'])
bio.wp_title = unescape(result['titleNoFormatting'] \
.split(' -')[0])
bio.wp_url = urllib2.unquote(result['url'].encode()).decode(
'utf-8', 'replace')
bio.wp_html = query_web(bio.wp_url, web_queries, do_refresh)
#if not opts.ambiguous and \
#('disambigbox' in bio.wp_html or 'setindexbox' in bio.wp_html):
# A page carrying one of these markers is treated as a disambiguation page
# and replaced by a page it links to (unless the ambiguous option is set).
DISAMBIG = ('disambigbox', 'setindexbox')
if not opts.ambiguous and any([disambig in bio.wp_html for disambig in DISAMBIG]):
critical("disambiguating %s at %s" % (bio.wp_title, bio.wp_url))
bio.wp_title = bio.wp_title + ' [disambiguated]'
parser_dbox = etree.HTMLParser(remove_comments = True)
doc_dbox = etree.parse(StringIO(bio.wp_html), parser_dbox)
# NOTE(review): the xpath expression and the site prefix are missing from the
# next statement, which is not closed.
url_dbox = '' + doc_dbox.xpath(
critical("url_dbox %s" % url_dbox)
bio.wp_url = url_dbox
bio.wp_html = query_web(bio.wp_url, web_queries, do_refresh)
#info(" bio.wp_html %s" %bio.wp_html)
bio.wp_oldid = get_wp_oldid(bio.wp_html)
bio.wp_purl = get_wp_purl(bio)
# NOTE(review): the right-hand side of the next assignment is missing
# (presumably a get_wp_text_wc() call on bio.wp_html); also no `break` and no
# `else:` before " not using" are visible in either result loop.
bio.wp_text, bio.wp_wc = \
if opts.gender and bio.gender == 'unknown':
bio.gender = guess_gender_pronouns(bio.wp_text)
info(" not using %s" %result['url'])
site = '' # /EBchecked/topic/
if born and int(born) < CUTOFF:
query = 'site:%s (%s)' % (site, name)
query = 'site:%s (%s %s)' % (site, name, born)
info(" query = %s" %query)
results = query_google(query, google_queries, do_refresh)
info(" results %s" %[r['titleNoFormatting'] for r in results])
for result in results:
if url_OK(result['url']) and are_similar(bio, result):
info(" using %s" %result['url'])
do_recurse = False
bio.eb_title = \
unescape(result['titleNoFormatting'].split(' --')[0])
bio.eb_url = urllib2.unquote(result['url'].encode())
# Section pages are only fetched when the third-from-last path segment of
# the URL is 'topic'.
if bio.eb_url.split('/')[-3] == 'topic':
do_recurse = True # get subsections
bio.eb_html = query_web(bio.eb_url, web_queries, do_refresh)
bio.eb_text, bio.eb_wc = get_eb_text_wc(bio.eb_html,
web_queries, do_recurse, do_refresh)
if opts.gender and bio.gender == 'unknown':
bio.gender = guess_gender_pronouns(bio.eb_text)
info(" not using %s" %result['url'])
# Last resort: guess from the name itself.
if opts.gender and bio.gender == 'unknown':
bio.gender = guess_gender_name(name)
# NOTE(review): as written the ratio is immediately overwritten with 0; an
# `else:` is presumably missing.
if bio.wp_wc and bio.eb_wc:
bio.wp_ratio = bio.wp_wc / float(bio.eb_wc)
bio.wp_ratio = 0
bios[name] = bio
return bios
# Options and source file
# Script entry point: parse the options, configure logging, read the source
# list named by the first positional argument, build the bios and write the
# report.
# NOTE(review): incomplete as it stands -- see the notes below.
# NOTE(review): critical / info / dbg and opts are bound only further down in
# this block, while the functions above use them as globals; if these
# assignments sit inside the __main__ guard, importing the module and calling
# those functions raises NameError -- confirm the intended nesting.
if __name__=='__main__':
opt_parser = OptionParser(usage="usage: %prog [options] file")
opt_parser.add_option("-a", "--ambiguous",
action="store_true", default=False,
help="don't disambuate WP pages w/ disambigbox")
opt_parser.add_option("-g", "--gender",
action="store_true", default=False,
help="perform gender analysis")
opt_parser.add_option('-l', '--log-to-file',
action="store_true", default=False,
help="log to file comp-topics.log")
opt_parser.add_option("-n", "--no-dates",
action="store_true", default=False,
help="don't use source birth/death dates")
opt_parser.add_option("-f", "--fast-cache",
action="store_true", default=False,
help="use cache if available, else query Web")
opt_parser.add_option("-c", "--cache-only",
action="store_true", default=False,
help="cache only, do no Web queries")
# NOTE(review): the -r and -d calls end on a trailing comma and are never
# closed; their final argument line is missing.
opt_parser.add_option("-r", "--refresh",
help="refresh a particular name in bios cache",
opt_parser.add_option("-d", "--delete",
help="delete a particular name from bios cache",
opt_parser.add_option("-t", "--text-include",
action="store_true", default=False,
help="include text of article in results {{copyvio}}")
opt_parser.add_option("-e", "--export-csv",
action="store_true", default=False,
help="export a comma seperated file in addition to HTML")
# NOTE(review): no action/default is visible for -v, yet opts.verbose is
# compared with integers below -- a counting action with an integer default
# is presumably missing.
opt_parser.add_option('-v', '--verbose',
help="increase verbosity (specify multiple times for more)")
opts, args = opt_parser.parse_args()
if opts.refresh:
opts.refresh = unicode(opts.refresh.decode('utf-8'))
# NOTE(review): an `else:` is presumably missing before the stderr
# assignment.  The file opened here is 'topics-comp.log' while the -l help
# text says 'comp-topics.log'.
if opts.log_to_file:
log_dest = open('topics-comp.log', 'w')
log_dest = sys.stderr
# 100 is above logging.CRITICAL, so nothing is logged by default.
log_level = 100 # default
if opts.verbose == 1: log_level = logging.CRITICAL # DEBUG
elif opts.verbose == 2: log_level = logging.INFO
elif opts.verbose >= 3: log_level = logging.DEBUG
logging.basicConfig(stream = log_dest, level=log_level,
format = "%(levelno)s %(funcName).5s: %(message)s")
critical = logging.critical
# NOTE(review): the right-hand side of `info =` is missing; by analogy with
# the lines around it, presumably the logging module's info function.
info =
dbg = logging.debug
source_fn = args[0]
source_fn_base = source_fn.split('.')[0]
# NOTE(review): the callee and first argument of the next statement are
# missing; the ('r', 'utf-8') arguments look like a codecs.open of source_fn
# -- confirm.
data =, 'r', 'utf-8').readlines()
people_url = data[0].strip() # first line in url of source
people = [p.strip() for p in data[1:]]
bios = build_bios(source_fn_base, people)
create_html_report(source_fn_base, bios, people, people_url)
