
@reagle
Created May 16, 2013 12:25
The program I used to analyze the balance of gender coverage in Wikipedia and Britannica. The gender guessing heuristics might be useful to others.
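Below is the script itself. Since the gender-guessing part is what others are most likely to reuse, here is a minimal, standalone sketch of its pronoun heuristic (the function name and example sentence are mine; the 0.25 cutoff mirrors guess_gender_pronouns in the script):

import re

HE_RE = re.compile(r'\b([Hh]is|[Hh]e)\b')
SHE_RE = re.compile(r'\b([Hh]er|[Ss]he)\b')

def guess_gender(text):
    """Return 'male', 'female', or 'unknown' from pronoun counts."""
    he = len(HE_RE.findall(text))
    she = len(SHE_RE.findall(text))
    diff = abs(she - he) / (he + she + 0.1)  # 0.1 avoids division by zero
    if diff < 0.25:  # counts too close to call
        return 'unknown'
    return 'female' if she > he else 'male'

# e.g. guess_gender("She wrote the memoir and her work was praised.") -> 'female'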
#!/usr/bin/python2.6
# -*- coding: utf-8 -*-
# (c) Copyright 2011-2013 by Joseph Reagle
# Licensed under the GPLv3, see <http://www.gnu.org/licenses/gpl-3.0.html>
import codecs
from difflib import SequenceMatcher
from cfiledict import FileDict # compressed FileDict
#from filedict import FileDict # http://erezsh.wordpress.com/2009/05/24/filedict-a-persistent-dictionary-in-python/
import logging
import lxml
from lxml import etree # the bare "etree" name is used below (etree.HTMLParser, etree.parse)
from lxml.html.clean import clean_html
from lxml.html import builder as E # http://effbot.org/zone/element-builder.htm
from lxml.html import fragment_fromstring # http://codespeak.net/lxml/lxmlhtml.html
from StringIO import StringIO
from optparse import OptionParser
import random
import re
from rpy import r
import simplejson
import sys
import time
import unicodedata
import urllib
import urllib2
from web_little import get_HTML # http://bitbucket.org/reagle/thunderdell/src/tip/web.py
from os import environ
HOME = environ['HOME']
#import socket
#socket.setdefaulttimeout(20)
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
###############
# Exceptions
###############
class FailedGet(Exception):
'''Exception raised when a query or fetch fails to return a result.'''
pass
###############
# Maths
###############
def L_ratio(seq1, seq2):
'''Sugar function for difflib.SequenceMatcher.ratio()'''
dbg(" comparing '%s' w/ '%s'" %(seq1, seq2))
return round(SequenceMatcher(None, seq1, seq2).ratio(), 2)
###############
# String functions
###############
from htmlentitydefs import name2codepoint
name2codepoint['#39'] = 39 # python 2.5.2 doesn't have apostrophe
def unescape(s):
"""Unescape HTML code refs; c.f. http://wiki.python.org/moin/EscapingHtml."""
return re.sub('&(%s);' % '|'.join(name2codepoint),
lambda m: unichr(name2codepoint[m.group(1)]), s)
def strip_accents(text):
"""Test if ascii, if not, remove accents.
"""
#>>> strip_accents(u'nôn-åscîî') # fails because of doctest bug
#u'non-ascii'
def not_combining(char):
return unicodedata.category(char) != 'Mn'
try: # test if ascii
text.encode('ascii')
except UnicodeEncodeError:
normalized_text = unicodedata.normalize('NFD', text)
result_text = filter(not_combining, normalized_text)
else:
result_text = text
result_text = result_text.replace('-', ' ').replace('"','').strip()
return result_text
def are_similar(bio, result):
"""Compare list title with returned titles for relatedness."""
title = result['titleNoFormatting'].split(' -')[0].strip() # remove encyclopedia
for name in bio.name.split(' OR '):
if '(' not in name: # if no paren in name, remove paren from title
title = title.split('(')[0]
if ', ' in title: # Britannica sometimes uses Last, First
if len(title.split(',')) == 2:
last, first = title.split(', ')
title = first + ' ' + last
dbg(" comparing %s" %name)
name_chars = unescape(strip_accents(name)).lower()
title_chars = unescape(strip_accents(title)).lower()
name_wrds = name_chars.split()
title_wrds = title_chars.split()
name_set = set(name_wrds)
title_set = set(title_wrds)
if (name_set.issubset(title_set) or title_set.issubset(name_set)) and \
(len(title_set) > 1 and 'surname' not in title_set):
dbg(" PASSED sup/sub %s" % title_set)
return True
if len(name_wrds) == 1 or len(title_wrds) == 1:
words_threshold = 0.68 # 0.67
chars_threshold = 0.77 # 0.68
else:
if len(name_wrds) == 2:
words_threshold = 0.62
chars_threshold = 0.71 # 0.69
if len(name_wrds) == 3:
words_threshold = 0.62
chars_threshold = 0.70 # 0.69
if len(name_wrds) >= 4:
words_threshold = 0.45 # 0.40
chars_threshold = 0.60
if len(name_wrds) >= 5: # words in long names often appear out of order in titles
name_wrds = sorted(name_wrds)
title_wrds = sorted(title_wrds)
name_chars = ' '.join(name_wrds)
title_chars = ' '.join(title_wrds)
words_threshold = 0.81
chars_threshold = 0.80
words_ratio = L_ratio(name_wrds, title_wrds)
if words_ratio >= words_threshold:
dbg(u" PASSED words %2.2f >= %2.2f '%s'" % (
words_ratio, words_threshold, title_wrds))
return True
else:
dbg(u" FAILED words %2.2f < %2.2f '%s'" % (
words_ratio, words_threshold, title_wrds))
chars_ratio = L_ratio(name_chars, title_chars)
if chars_ratio >= chars_threshold:
dbg(u" PASSED chars %2.2f >= %2.2f '%s'" % (
chars_ratio, chars_threshold, title_chars))
return True
else:
dbg(u" FAILED chars %2.2f < %2.2f '%s'" % (
chars_ratio, chars_threshold, title_chars))
return False
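# Illustrative example (inputs assumed): for the name "Marie Curie" and a result
# title "Marie Curie - Wikipedia, the free encyclopedia", the title is trimmed at
# ' -', both word sets match as subsets, and are_similar() returns True before the
# ratio thresholds are ever consulted.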
def split_name_date(person):
"""Some source lists have optional dates for query, split them up"""
dbg("person = '%s'" % person)
name = born = died = ''
if '<' in person:
name, date = person.rsplit(' <', 1)
born, died = date[:-1].split(' - ')
else:
name = person
if opts.no_dates:
born = died = ''
return name, born, died
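# Expected line formats (examples mine): "Ada Lovelace <1815 - 1852>" ->
# ('Ada Lovelace', '1815', '1852'); a bare "Ada Lovelace" -> ('Ada Lovelace', '', '').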
###############
# Create report
###############
def create_html_report(source_fn_base, bios, people, people_url):
"""Return HTML report of topical encyclopedic converage."""
critical("performing analysis for report")
report_f = codecs.open(source_fn_base + '.html', 'w', 'UTF-8', 'replace')
if opts.export_csv:
csv_f = codecs.open(source_fn_base + '.csv', 'w', 'UTF-8', 'replace')
DOCTYPE = u'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'
html = E.HTML(
E.HEAD(
E.TITLE("%s Biographies" % source_fn_base),
E.LINK(rel="stylesheet", type="text/css",
href="../../../2001/reagle.css"),
E.STYLE(
".male {background-color: #f2f2ff}"
".female {background-color: #fff2fd}"
".unknown {background-color: #f4fff2}", type="text/css")
))
# for R dataframe
genders = []
wp_sizes = []
eb_sizes = []
for person in people:
key, dummy, dummy = split_name_date(person)
bio = bios[key]
genders.append(bio.gender)
wp_sizes.append(bio.wp_wc)
eb_sizes.append(bio.eb_wc)
# Statistical analysis
r.assign('refwork', r.data_frame(gender=genders, wp_size=wp_sizes, eb_size=eb_sizes))
SRCmale = r('sum(refwork$gender == "male")')
SRCfem = r('sum(refwork$gender == "female")')
SRCun = r('sum(refwork$gender == "unknown")')
WPmale = r('sum(refwork$gender == "male" & refwork$wp.size != 0)')
WPfem = r('sum(refwork$gender == "female" & refwork$wp.size != 0)')
EBmale = r('sum(refwork$gender == "male" & refwork$eb.size != 0)')
EBfem = r('sum(refwork$gender == "female" & refwork$eb.size != 0)')
SRCgendered = r('sum(refwork$gender != "unknown")')
WPgendered = r('sum(refwork$gender != "unknown" & refwork$wp.size != 0)')
EBgendered = r('sum(refwork$gender != "unknown" & refwork$eb.size != 0)')
inSRC = r('length(refwork$gender)')
missingEB = r('sum(refwork$eb.size == 0)')
missingWP = r('sum(refwork$wp.size == 0)')
missingBoth = r('sum(refwork$eb.size == 0 & refwork$wp.size == 0)')
wp_existing_summary = ' '.join((' %s %5.0f ' % (k, v)
for k,v in r('summary(refwork$wp.size[refwork$wp.size != 0], digits=5)').items()))
eb_existing_summary = ' '.join((' %s %5.0f ' % (k, v)
for k,v in r('summary(refwork$eb.size[refwork$eb.size != 0], digits=5)').items()))
wp_median = r('median(refwork$wp.size[refwork$wp.size != 0])')
eb_median = r('median(refwork$eb.size[refwork$eb.size != 0])')
mutual_wp_median = r('median(refwork$wp.size[refwork$wp.size != 0 & '
'refwork$eb.size != 0])')
mutual_eb_median = r('median(refwork$eb.size[refwork$eb.size != 0 & '
'refwork$wp.size != 0])')
body = E.BODY(
E.H1("%s Biographies" % source_fn_base)
)
html.append(body)
body.extend((
E.P(
"Missing: WP = %d ; EB = %d ; neither = %d . "
%(missingWP, missingEB, missingBoth)),
)
)
if opts.gender:
gender_text = (
"<p>Of %d entries: I guess that %d are <span class='female'>female</span>, "
"%d are <span class='male'>male</span> "
"and %d are <span class='unknown'>unknown</span>. "
"That is, females are %0.2f of the gender-known population. "
"Of the Wikipedia articles, females are %0.2f (%d/(%d+%d)); "
"and %0.2f (%d/(%d+%d)) at Britannica. </p>"
% (
inSRC, SRCfem,
SRCmale,
SRCun,
float(SRCfem)/(SRCgendered),
float(WPfem)/(WPgendered), WPfem, WPmale, WPfem,
float(EBfem)/(EBgendered), EBfem, EBmale, EBfem
))
body.append(fragment_fromstring(gender_text))
body.extend((
E.P(
"Existing median word count (for articles that exist in each work alone): WP = %0.0f ; EB = %0.0f . "
"WP median article size is roughly %0.1f times larger."
%(wp_median, eb_median, wp_median/eb_median) ),
E.P("Five figure summaries for existing articles."),
E.PRE(
"WP: %s \n"
"EB: %s "
%(wp_existing_summary, eb_existing_summary) ),
E.P(
"Mutual median word count (for articles that exist in both works only):, WP median = %0.0f words, EB = %0.0f . "
"Mutual WP median article size is roughly %0.1f times larger."
%(mutual_wp_median, mutual_eb_median, mutual_wp_median/mutual_eb_median) ),
))
r('png(file="%s.png")' % source_fn_base) # ,width=733,height=550
r('boxplot(refwork$wp.size[refwork$wp.size != 0], refwork$eb.size[refwork$eb.size != 0], '
'names = c("Wikipedia", "Britannica"), '
'main = "Existing article word counts")' )
body.append(
E.P(
E.IMG(alt="size distribution", src="%s.png" % source_fn_base)
)
)
critical("generating table")
table = E.TABLE(
E.COL(), E.COL(E.CLASS('col-alt')), E.COL(),
E.COL(E.CLASS('col-alt')), E.COL(),
E.THEAD(E.TR(E.TH('Name'),
E.TH('Wikipedia', width="35%"), E.TH('Words'),
E.TH('Britannica', width="35%"), E.TH('Words'))),
width="100%", cellpadding="5", border="1")
#if opts.export_csv:
#csv_f.write('source, name, gender, born, died, list, count \n')
for person in people:
key, dummy, dummy = split_name_date(person)
bio = bios[key]
table.append(
E.TR(
E.TD(E.A(bio.name, href='%s' % (people_url)), # , bio.last_name
E.CLASS(bio.gender)),
E.TD(E.A(bio.wp_title, href=bio.wp_purl), E.CLASS(bio.gender)),
E.TD(str(bio.wp_wc)), # size
E.TD(E.A(bio.eb_title, href=bio.eb_url), E.CLASS(bio.gender)),
E.TD(str(bio.eb_wc)), # size
valign = "center",
))
if opts.text_include:
table.append(
E.TR(
E.TD(''),
E.TD(colspan='2', *[E.P(p) for p in bio.wp_text.split('\n')]),
E.TD(colspan='2', *[E.P(p) for p in bio.eb_text.split('\n')]),
valign="top",
),
)
if opts.export_csv:
for source in source_fn_base.split('-'):
if source.isalpha():
break
csv_f.write(u'%s;%s;%s;%s;%s;WP;%s\n' %
(source, bio.name, bio.gender, bio.born, bio.died, bio.wp_wc) )
csv_f.write(u'%s;%s;%s;%s;%s;EB;%s\n' %
(source, bio.name, bio.gender, bio.born, bio.died, bio.eb_wc) )
body.append(table)
report_f.write(DOCTYPE + lxml.html.tostring(html, pretty_print=True))
report_f.close()
if opts.export_csv:
csv_f.close()
###############
# Web
###############
def url_OK(url):
'''Check if URL is in right Web space and with textual extension.'''
dbg(" testing %s" % url)
is_ok = True
BAD_EXTENSIONS = ('doc', 'pdf', 'jpg', 'png', 'gif') # Google
BAD_WP_NAMESPACES = ('/Wikiquote:', '/Wikisource:')
if url.split('.')[-1].lower() in BAD_EXTENSIONS:
is_ok = False
if 'britannica.com' in url:
if '/EBchecked/topic/' not in url:
is_ok = False
if 'wikipedia.org' in url:
if '(disambiguation)' in url: # WP
is_ok = False
if any([ns in url for ns in BAD_WP_NAMESPACES]):
is_ok = False
dbg(" url_OK = %s" % is_ok)
return is_ok
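# e.g. url_OK('http://en.wikipedia.org/wiki/Foo_(disambiguation)') -> False (made-up
# URL); Britannica URLs are accepted only when they contain /EBchecked/topic/.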
def query_google(query, google_queries, do_refresh, retry_counter=0):
"""Return ordered results from Google API via google_queries cache."""
info(" testing google_queries for '%s'" % query)
results = google_queries.get(query, None)
if do_refresh or (results is None and not opts.cache_only):
info(" refresh or g_cache MISS for %s" %query)
url = "http://ajax.googleapis.com/ajax/services/" + \
"search/web?start=0&v=1.0&%s" % ( # &rsz=large for 8 results
urllib.urlencode({'q': query.encode('utf-8')}))
request = urllib2.Request( url, None, {'Referer': 'http://reagle.org/joseph/'})
try:
search_results = urllib2.urlopen(request)
json = simplejson.loads(search_results.read())
dbg(" responseStatus = %s" %json['responseStatus'])
if json['responseStatus'] == 404:
raise FailedGet("Google API: %s" %json['responseDetails'])
results = json['responseData']['results']
info(" got results")
except (urllib2.URLError) as e: # TypeError,
info(" retrying query_google after %s" %e)
time.sleep((retry_counter * 10 + 5)) # pause before retrying
results = query_google(query, google_queries, do_refresh, retry_counter + 1)
google_queries[query] = results
if retry_counter == 2:
raise FailedGet("Failed to get after 3 attempts.")
if results is None and opts.cache_only:
raise FailedGet("No cache result found with cache-only option.")
info(" returning results")
return results
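# Note: results are memoized in the google_queries FileDict under the literal query
# string; a urllib2.URLError triggers a sleep of retry_counter * 10 + 5 seconds and a
# recursive retry, and FailedGet is raised after three failed attempts.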
def query_web(url, web_queries, do_refresh):
"""Grab Web page (encyclopedic article) from URL via web_queries cache."""
url = url.rsplit('#')[0] # don't bother with fragments
info(" testing web_queries for '%s'" % url)
dbg(" do_refresh = %s, opts.refresh = %s opts.cache_only = %s" %
(do_refresh, opts.refresh, opts.cache_only))
html = web_queries.get(url, None)
if do_refresh or (html is None and not opts.cache_only):
dbg(" retrieving from Web")
time.sleep(random.randint(1, 3))
html, response = get_HTML(url)
web_queries[url] = html
else:
dbg(" found in web_queries cache")
if html is None and opts.cache_only:
raise FailedGet("No cache result found with cache-only option.")
return html
def get_text(content_node):
'''Return textual content of nodes sans elements.'''
content_text = ''.join(content_node.xpath("descendant-or-self::text()"))
return content_text
def remove_node(node):
'''Remove a node from a tree.'''
parent = node.getparent()
parent.remove(node)
def get_wp_text_wc(html):
'''Return content of article in textual form without miscellany.'''
def trim_wp(node):
'''To keep analysis comparable, remove most everything but article prose.'''
remove_node(node[0]) # "From Wikipedia, the free encyclopedia"
remove_node(node[1]) # Jump to:navigation,
try:
for n in node.xpath('//table'):
remove_node(n)
except IndexError:
pass
EXCLUDED_SECTIONS = ('See_also', 'Further_Reading', 'Further_reading',
'Notes', 'External_Links', 'References', 'External_links',
'Footnote', 'Footnotes', 'Sources', 'Notes_and_sources',
'Bibliography', 'Notes_and_references',
'Works', 'Publications',
'List_of_works', 'Selected_works', 'Novels',
'Selected_bibliography', 'Publications_and_speeches',
'Records', 'Other_Records', 'Career_statistics','Teams_and_victories',
'Professional_wins',
'Accomplishments', 'Championships_and_accomplishments',
'Records_and_achievements', 'Major_Projects', 'Discography',
'Original_Compositions', 'Videography', 'Notable_works',
'Filmography', 'Selected_filmography', 'Partial_filmography',
'Music_videography', 'Books',
'Achievements', 'Notable_roles',
'Television_appearances', 'In_popular_culture',
'Popular_culture',
'Titles_and_honours', 'Honours', 'Awards', 'Awards_and_fellowships',
'Awards_and_nominations', 'Career_highlights',
'Awards_and_honors', 'Recognition', 'Selected_awards', 'Other_honors',
'Honors_and_awards', 'Awards_and_recognitions',
'Titles.2C_styles_and_honours', 'Honorary_degrees', 'Prizes',
)
for misc in EXCLUDED_SECTIONS:
try:
misc_node = node.xpath('//span[@id="%s"]/ancestor::h2' % misc)[0]
[remove_node(n) for n in misc_node.xpath('./following-sibling::*')]
remove_node(misc_node)
except IndexError: # in case no node found
pass
return node
html = clean_html(html)
html_parser = etree.HTMLParser(remove_blank_text = True, remove_comments = True,
remove_pis = True, strip_cdata = True)
doc = etree.parse(StringIO(html), html_parser)
div = doc.xpath("//div[@id='bodyContent']")[0]
#dbg(" div = %s" % ('\n'.join(n.tag for n in div[1:10])))
#dbg(lxml.html.tostring(div, pretty_print=True))
div = trim_wp(div)
text = get_text(div)
text = text.replace('[edit]', '')
return text, len(text.split())
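# The returned word count is len(text.split()), i.e. whitespace-delimited tokens,
# computed after navigation, tables, '[edit]' links, and the excluded sections above
# have been stripped.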
def get_eb_text_wc(html, web_queries, do_recurse, do_refresh):
'''Return content of article in textual form without miscellany.
Will follow links if necessary since EB pages are dynamic.
'''
def trim_eb(node):
'''Trim material following the citations.'''
try:
misc_node = node.xpath('//h2[text() = "Citations"]')[0]
dbg(" misc_node %s" %misc_node)
[remove_node(n) for n in misc_node.xpath('./following::*')]
remove_node(misc_node)
dbg(" trimmed it!")
except IndexError: # in case no node found
pass
return node
html_parser = etree.HTMLParser(remove_blank_text = True, remove_comments = True,
remove_pis = True, strip_cdata = True)
doc = etree.parse(StringIO(html), html_parser)
if do_recurse: # EB articles might have other sections to fetch
EXCLUDED_SECTIONS = (
"Major Works", "Additional Reading", "Biographies",
"Critical studies", "Related Articles", "Supplemental Information",
"Quotations", "Spotlights", "External Web sites", "Citations",
"Year in Review Links",)
info(" YES recurse")
text = ''
# Find links to other sections
toc_options = doc.xpath("//div[@id='bps-article-toc']/select/option")
info("toc_options = %s" % toc_options)
toc_options.pop(1) # skip the second since it's included in Main
dbg(" toc_options %s" %toc_options)
for opt in toc_options:
opt_url = 'http://www.britannica.com' + opt.get('value')
if opt.get('title') not in EXCLUDED_SECTIONS and '#' not in opt_url:
dbg(" checking %s" %opt.get('title'))
dbg(" opt_url %s" %opt_url)
html = query_web(opt_url, web_queries, do_refresh)
html_parser = etree.HTMLParser(remove_comments = True)
doc = etree.parse(StringIO(html), html_parser)
div = doc.xpath('//div[@class="KonaBody"]') # /ancestor::div
if div:
div = trim_eb(div[0])
text += get_text(div)
else:
dbg(" didn't find content in %s" %opt.get('title'))
dbg(html)
else:
dbg(" NO recurse")
div = doc.xpath('//div[@id="bps-left-article-wrapper"]')[0]
div = trim_eb(div)
dbg(" div %s" %div)
text = get_text(div)
return text, len(text.split())
wgCurRevisionId_regexp = re.compile('wgCurRevisionId=(\d+)')
oldid_regexp = re.compile('oldid=(\d+)')
def get_wp_oldid(wp_html):
'''Return oldid for construction of permanent URI.'''
wgCurRevisionId = wgCurRevisionId_regexp.search(wp_html)
if wgCurRevisionId:
info("wgCurRevisionId.groups() = %s" % wgCurRevisionId.groups())
return wgCurRevisionId.groups()[0]
else:
oldid = oldid_regexp.search(wp_html)
if oldid:
info("oldid.groups() = %s" %oldid.groups())
return oldid.groups()[0]
else:
critical("No oldid found")
raise FailedGet
def get_wp_purl(bio):
'''Return formatted permanent URI.'''
root_url = u'http://en.wikipedia.org/w/index.php?title='
return root_url + bio.wp_url.split('/')[-1] + '&oldid=' + bio.wp_oldid
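# e.g. (hypothetical ids) a wp_url ending in /wiki/Ada_Lovelace with wp_oldid '123456'
# yields 'http://en.wikipedia.org/w/index.php?title=Ada_Lovelace&oldid=123456'.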
###############
# Gender guessing functions
###############
he_re = re.compile(r'\b([Hh]is|[Hh]e)\b')
she_re = re.compile(r'\b([Hh]er|[Ss]he)\b')
def guess_gender_pronouns(text):
'''Guess gender based on proportion of pronouns.'''
info("guessing gender via pronouns")
she = len(she_re.findall(text))
he = len(he_re.findall(text))
diff = abs(she - he)/(he + she + 0.1)
dbg("he = %d, she = %d, diff = %f" %(he, she, diff))
if diff < 0.25:
gender = 'unknown'
elif she > he:
gender = 'female'
else:
gender = 'male'
dbg("gender = %s" %gender)
return gender
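# Worked example (counts assumed): she = 6, he = 1 gives diff = 5/7.1 ~= 0.70, which
# clears the 0.25 cutoff, so the guess is 'female'; she = 3, he = 2 gives
# diff = 1/5.1 ~= 0.20 and the result stays 'unknown'.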
def create_name_dict(fn):
'''Utility function to build dictionaries of name frequencies.'''
d = {}
names = open(fn).readlines()
for line in names[1:]:
if ' na ' not in line:
name, frequency, number, rank = line.split()
d[name] = {}
d[name]['freq'] = float(frequency)
return d
FEMALE_NAMES = create_name_dict(HOME+'/joseph/2010/03/names-female.csv')
MALE_NAMES = create_name_dict(HOME+'/joseph/2010/03/names-male.csv')
HONORIFIC_MALE = [honor + ' ' for honor in
('Baron', 'Brother', 'Comte', 'Count', 'Duc', 'Duke', 'Earl of',
'Father', 'Marquess', 'Marquis', 'Prince', 'Sir',
'Viscount', 'Vicomte' )]
HONORIFIC_FEMALE = [honor + ' ' for honor in
('Baroness', 'Comtesse', 'Countess', 'Dame',
'Marchioness', 'Marquise', 'Princess', 'Sister', 'Queen')]
def guess_gender_name(name):
'''Guess the gender based only on the name using honorifics and
statistical tables of name frequencies
>>> guess_gender_name('Joseph Reagle')
'male'
>>> guess_gender_name('Sir Jehne Smith')
'male'
>>> guess_gender_name('Dame Jijij Foo')
'female'
'''
info("guessing gender via names")
gender = 'unknown'
if any([honor in name for honor in HONORIFIC_MALE]):
gender = 'male'
elif any([honor in name for honor in HONORIFIC_FEMALE]):
gender = 'female'
if gender == 'unknown':
name = name.replace('Dr. ', '')
given = name.split()[0].upper()
if given in FEMALE_NAMES and given in MALE_NAMES:
dbg("freqs = %f %f" % (MALE_NAMES[given]['freq'], FEMALE_NAMES[given]['freq']) )
if MALE_NAMES[given]['freq'] > 4 * FEMALE_NAMES[given]['freq']:
gender = 'male'
elif FEMALE_NAMES[given]['freq'] > 4 * MALE_NAMES[given]['freq']:
gender = 'female'
else:
if given in MALE_NAMES:
gender = 'male'
if given in FEMALE_NAMES:
gender = 'female'
info("gender = %s" %gender)
return gender
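# Note on the frequency rule: when a given name appears in both census tables, one
# gender must be more than 4x as frequent as the other to decide; otherwise the guess
# stays 'unknown' despite the name being found in both.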
###############
# Biography class
###############
class Biography():
name = ''
last_name = ''
born = ''
died = ''
gender = 'unknown'
wp_title = ''
wp_url = ''
wp_purl = ''
wp_html = ''
wp_text = ''
wp_wc = 0
wp_ratio = 0
wp_oldid = ''
eb_title = ''
eb_url = ''
eb_html = ''
eb_text = ''
eb_wc = 0
def __init__(self, name, born = '', died = ''):
self.name = name
self.last_name = name.split(' OR ')[0].split('(')[0].split()[-1] # last word of the first name variant
self.born = born
self.died = died
def __str__(self):
#showList = ["a", "b"]
showList = sorted(set(self.__dict__))
return ("X(%i):\n" % id(self)) + "\n".join([" %s: %s" % (
key.rjust(8), self.__dict__[key]) for key in showList])
###############
# Web scrape and build bios
###############
def build_bios(people_proj, people):
"""Build biography for each person including EB/WP information."""
bios = FileDict(filename='cache/bios-%s' % people_proj + '.db')
google_queries = FileDict(filename='cache/google-%s' % people_proj + '.db')
web_queries = FileDict(filename='cache/web-%s' % people_proj + '.db')
if opts.delete:
del bios[opts.delete]
return bios
CUTOFF = 1900
for person in people:
name, born, died = split_name_date(person)
critical("** Checking '%s' <%s - %s> **" % (name, born, died))
bio = Biography(name, born, died)
if opts.refresh and bio.name in opts.refresh: # so I can pass in more than one name
do_refresh = True
else:
do_refresh = False
if name in bios:
if opts.fast_cache or opts.cache_only:
info(" '%s' is duplicated, skipping" % name)
continue
else:
if opts.cache_only:
critical("No cached bio found for '%s'" %name)
raise FailedGet
info("WIKIPEDIA")
site = 'en.wikipedia.org'
if born and int(born) < CUTOFF:
query = 'site:%s (%s)' % (site, name)
else:
query = 'site:%s (%s %s)' % (site, name, born)
dbg(u" query = %s" % query)
results = query_google(query, google_queries, do_refresh)
info(" results %s" % [r['titleNoFormatting'] for r in results])
for result in results:
if url_OK(result['url']) and are_similar(bio, result):
info(" using %s" %result['url'])
bio.wp_title = unescape(result['titleNoFormatting'] \
.split(' -')[0])
bio.wp_url = urllib2.unquote(result['url'].encode()).decode(
'utf-8', 'replace')
bio.wp_html = query_web(bio.wp_url, web_queries, do_refresh)
#if not opts.ambiguous and \
#('disambigbox' in bio.wp_html or 'setindexbox' in bio.wp_html):
DISAMBIG = ('disambigbox', 'setindexbox')
if not opts.ambiguous and any([disambig in bio.wp_html for disambig in DISAMBIG]):
critical("disambiguating %s at %s" % (bio.wp_title, bio.wp_url))
bio.wp_title = bio.wp_title + ' [disambiguated]'
parser_dbox = etree.HTMLParser(remove_comments = True)
doc_dbox = etree.parse(StringIO(bio.wp_html), parser_dbox)
url_dbox = 'http://en.wikipedia.org' + doc_dbox.xpath(
"//div[@id='bodyContent']/ul//a[not(@class='new')]/@href")[0]
critical("url_dbox %s" % url_dbox)
bio.wp_url = url_dbox
bio.wp_html = query_web(bio.wp_url, web_queries, do_refresh)
#info(" bio.wp_html %s" %bio.wp_html)
bio.wp_oldid = get_wp_oldid(bio.wp_html)
bio.wp_purl = get_wp_purl(bio)
bio.wp_text, bio.wp_wc = \
get_wp_text_wc(bio.wp_html)
if opts.gender and bio.gender == 'unknown':
bio.gender = guess_gender_pronouns(bio.wp_text)
break
else:
info(" not using %s" %result['url'])
info("BRITANNICA")
site = 'www.britannica.com' # /EBchecked/topic/
if born and int(born) < CUTOFF:
query = 'site:%s (%s)' % (site, name)
else:
query = 'site:%s (%s %s)' % (site, name, born)
info(" query = %s" %query)
results = query_google(query, google_queries, do_refresh)
info(" results %s" %[r['titleNoFormatting'] for r in results])
for result in results:
if url_OK(result['url']) and are_similar(bio, result):
info(" using %s" %result['url'])
do_recurse = False
bio.eb_title = \
unescape(result['titleNoFormatting'].split(' --')[0])
bio.eb_url = urllib2.unquote(result['url'].encode())
if bio.eb_url.split('/')[-3] == 'topic':
do_recurse = True # get subsections
bio.eb_html = query_web(bio.eb_url, web_queries, do_refresh)
bio.eb_text, bio.eb_wc = get_eb_text_wc(bio.eb_html,
web_queries, do_recurse, do_refresh)
if opts.gender and bio.gender == 'unknown':
bio.gender = guess_gender_pronouns(bio.eb_text)
break
else:
info(" not using %s" %result['url'])
if opts.gender and bio.gender == 'unknown':
bio.gender = guess_gender_name(name)
if bio.wp_wc and bio.eb_wc:
bio.wp_ratio = bio.wp_wc / float(bio.eb_wc)
else:
bio.wp_ratio = 0
bios[name] = bio
return bios
###############
# Options and source file
###############
if __name__=='__main__':
opt_parser = OptionParser(usage="usage: %prog [options] file")
opt_parser.add_option("-a", "--ambiguous",
action="store_true", default=False,
help="don't disambuate WP pages w/ disambigbox")
opt_parser.add_option("-g", "--gender",
action="store_true", default=False,
help="perform gender analysis")
opt_parser.add_option('-l', '--log-to-file',
action="store_true", default=False,
help="log to file comp-topics.log")
opt_parser.add_option("-n", "--no-dates",
action="store_true", default=False,
help="don't use source birth/death dates")
opt_parser.add_option("-f", "--fast-cache",
action="store_true", default=False,
help="use cache if available, else query Web")
opt_parser.add_option("-c", "--cache-only",
action="store_true", default=False,
help="cache only, do no Web queries")
opt_parser.add_option("-r", "--refresh",
help="refresh a particular name in bios cache",
metavar="NAME")
opt_parser.add_option("-d", "--delete",
help="delete a particular name from bios cache",
metavar="NAME")
opt_parser.add_option("-t", "--text-include",
action="store_true", default=False,
help="include text of article in results {{copyvio}}")
opt_parser.add_option("-e", "--export-csv",
action="store_true", default=False,
help="export a comma seperated file in addition to HTML")
opt_parser.add_option('-v', '--verbose',
action='count',
help="increase verbosity (specify multiple times for more)")
opts, args = opt_parser.parse_args()
if opts.refresh:
opts.refresh = unicode(opts.refresh.decode('utf-8'))
if opts.log_to_file:
log_dest = open('topics-comp.log', 'w')
else:
log_dest = sys.stderr
log_level = 100 # default
if opts.verbose == 1: log_level = logging.CRITICAL # DEBUG
elif opts.verbose == 2: log_level = logging.INFO
elif opts.verbose >= 3: log_level = logging.DEBUG
logging.basicConfig(stream = log_dest, level=log_level,
format = "%(levelno)s %(funcName).5s: %(message)s")
critical = logging.critical
info = logging.info
dbg = logging.debug
source_fn = args[0]
source_fn_base = source_fn.split('.')[0]
data = codecs.open(source_fn, 'r', 'utf-8').readlines()
people_url = data[0].strip() # first line is the URL of the source list
people = [p.strip() for p in data[1:]]
bios = build_bios(source_fn_base, people)
create_html_report(source_fn_base, bios, people, people_url)
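# Typical invocation (illustrative; the script and list file names are assumptions):
# the input file's first line is the source URL and each later line is a person,
# optionally with dates, e.g. "Ada Lovelace <1815 - 1852>". For a gendered report
# with CSV export and verbose logging:
#
#   python2.6 topics-comp.py -g -e -vv atlantic-100.txt
#
# which writes atlantic-100.html, atlantic-100.png, and (with -e) atlantic-100.csv
# next to the source list.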