Created May 16, 2013 12:25
The program I used to analyze the balance of gender coverage in Wikipedia and Britannica. The gender-guessing heuristics might be useful to others.
#!/usr/bin/python2.6
# -*- coding: utf-8 -*-
# (c) Copyright 2011-2013 by Joseph Reagle
# Licensed under the GPLv3, see <http://www.gnu.org/licenses/gpl-3.0.html>

import codecs
from difflib import SequenceMatcher
from cfiledict import FileDict  # compressed FileDict
#from filedict import FileDict  # http://erezsh.wordpress.com/2009/05/24/filedict-a-persistent-dictionary-in-python/
import logging
import lxml
from lxml import etree  # used below as the bare name `etree`
from lxml.html.clean import clean_html
from lxml.html import builder as E  # http://effbot.org/zone/element-builder.htm
from lxml.html import fragment_fromstring  # http://codespeak.net/lxml/lxmlhtml.html
from StringIO import StringIO
from optparse import OptionParser
import random
import re
from rpy import r
import simplejson
import sys
import time
import unicodedata
import urllib
import urllib2
from web_little import get_HTML  # http://bitbucket.org/reagle/thunderdell/src/tip/web.py
from os import environ

HOME = environ['HOME']

#import socket
#socket.setdefaulttimeout(20)

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
###############
# Exceptions
###############

class FailedGet(Exception):
    '''Exception for when one can't return something.'''
    pass

###############
# Maths
###############

def L_ratio(seq1, seq2):
    '''Sugar function for difflib.SequenceMatcher.ratio()'''
    dbg(" comparing '%s' w/ '%s'" % (seq1, seq2))
    return round(SequenceMatcher(None, seq1, seq2).ratio(), 2)
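
# For example (a doctest-style sketch added for illustration; assumes dbg is
# bound to a logger, as done in the __main__ block below):
#
#   >>> L_ratio('smith', 'smyth')
#   0.8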
###############
# String functions
###############

from htmlentitydefs import name2codepoint
name2codepoint['#39'] = 39  # python 2.5.2 doesn't have apostrophe

def unescape(s):
    """Unescape HTML code refs; c.f. http://wiki.python.org/moin/EscapingHtml."""
    return re.sub('&(%s);' % '|'.join(name2codepoint),
        lambda m: unichr(name2codepoint[m.group(1)]), s)
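
# For example (illustrative only):
#
#   >>> unescape(u'Caf&eacute; &amp; Bar')
#   u'Caf\xe9 & Bar'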
def strip_accents(text):
    """Test if ascii; if not, remove accents."""
    # Doctest disabled because of a unicode doctest bug; note the expected
    # output also reflects the '-' to ' ' replacement below:
    #>>> strip_accents(u'nôn-åscîî')
    #u'non ascii'
    def not_combining(char):
        return unicodedata.category(char) != 'Mn'
    try:  # test if ascii
        text.encode('ascii')
    except UnicodeEncodeError:
        normalized_text = unicodedata.normalize('NFD', text)
        result_text = filter(not_combining, normalized_text)
    else:
        result_text = text
    result_text = result_text.replace('-', ' ').replace('"', '').strip()
    return result_text
def are_similar(bio, result):
    """Compare list title with returned titles for relatedness."""
    title = result['titleNoFormatting'].split(' -')[0].strip()  # remove encyclopedia
    for name in bio.name.split(' OR '):
        if '(' not in name:  # if no paren in name, remove paren from title
            title = title.split('(')[0]
        if ', ' in title:  # Britannica sometimes uses Last, First
            if len(title.split(',')) == 2:
                last, first = title.split(', ')
                title = first + ' ' + last
        dbg(" comparing %s" % name)
        name_chars = unescape(strip_accents(name)).lower()
        title_chars = unescape(strip_accents(title)).lower()
        name_wrds = name_chars.split()
        title_wrds = title_chars.split()
        name_set = set(name_wrds)
        title_set = set(title_wrds)
        if (name_set.issubset(title_set) or title_set.issubset(name_set)) and \
                (len(title_set) > 1 and 'surname' not in title_set):
            dbg(" PASSED sup/sub %s" % title_set)
            return True
        if len(name_wrds) == 1 or len(title_wrds) == 1:
            words_threshold = 0.68  # 0.67
            chars_threshold = 0.77  # 0.68
        else:
            if len(name_wrds) == 2:
                words_threshold = 0.62
                chars_threshold = 0.71  # 0.69
            if len(name_wrds) == 3:
                words_threshold = 0.62
                chars_threshold = 0.70  # 0.69
            if len(name_wrds) >= 4:
                words_threshold = 0.45  # 0.40
                chars_threshold = 0.60
            if len(name_wrds) >= 5:  # long names often have title words out of order
                name_wrds = sorted(name_wrds)
                title_wrds = sorted(title_wrds)
                name_chars = ' '.join(name_wrds)
                title_chars = ' '.join(title_wrds)
                words_threshold = 0.81
                chars_threshold = 0.80
        words_ratio = L_ratio(name_wrds, title_wrds)
        if words_ratio >= words_threshold:
            dbg(u" PASSED words %2.2f >= %2.2f '%s'" % (
                words_ratio, words_threshold, title_wrds))
            return True
        else:
            dbg(u" FAILED words %2.2f < %2.2f '%s'" % (
                words_ratio, words_threshold, title_wrds))
        chars_ratio = L_ratio(name_chars, title_chars)
        if chars_ratio >= chars_threshold:
            dbg(u" PASSED chars %2.2f >= %2.2f '%s'" % (
                chars_ratio, chars_threshold, title_chars))
            return True
        else:
            dbg(u" FAILED chars %2.2f < %2.2f '%s'" % (
                chars_ratio, chars_threshold, title_chars))
    return False
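
# A worked sketch (hypothetical result dict; assumes dbg is bound as in
# __main__). The name's word set is a subset of the title's, the title set
# has more than one word and no 'surname', so the subset test passes:
#
#   >>> result = {'titleNoFormatting': 'Mary Wollstonecraft Shelley'}
#   >>> are_similar(Biography('Mary Shelley'), result)
#   True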
def split_name_date(person):
    """Source lists may carry optional birth/death dates; split them off."""
    dbg("person = '%s'" % person)
    name = born = died = ''
    if '<' in person:
        name, date = person.rsplit(' <', 1)
        born, died = date[:-1].split(' - ')
    else:
        name = person
    if opts.no_dates:
        born = died = ''
    return name, born, died
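
# For example (a hypothetical list entry; assumes opts.no_dates is False and
# dbg is bound as in __main__):
#
#   >>> split_name_date(u'Ada Lovelace <1815 - 1852>')
#   (u'Ada Lovelace', u'1815', u'1852')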
###############
# Create report
###############

def create_html_report(source_fn_base, bios, people, people_url):
    """Return HTML report of topical encyclopedic coverage."""
    critical("performing analysis for report")
    report_f = codecs.open(source_fn_base + '.html', 'w', 'UTF-8', 'replace')
    if opts.export_csv:
        csv_f = codecs.open(source_fn_base + '.csv', 'w', 'UTF-8', 'replace')
    DOCTYPE = u'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'
    html = E.HTML(
        E.HEAD(
            E.TITLE("%s Biographies" % source_fn_base),
            E.LINK(rel="stylesheet", type="text/css",
                href="../../../2001/reagle.css"),
            E.STYLE(
                ".male {background-color: #f2f2ff}"
                ".female {background-color: #fff2fd}"
                ".unknown {background-color: #f4fff2}", type="text/css")
        ))
    # for R dataframe
    genders = []
    wp_sizes = []
    eb_sizes = []
    for person in people:
        key, dummy, dummy = split_name_date(person)
        bio = bios[key]
        genders.append(bio.gender)
        wp_sizes.append(bio.wp_wc)
        eb_sizes.append(bio.eb_wc)
    # Statistical analysis; rpy maps the underscores in wp_size/eb_size to
    # dots, hence `refwork$wp.size` in the R expressions below
    r.assign('refwork', r.data_frame(gender=genders, wp_size=wp_sizes, eb_size=eb_sizes))
    SRCmale = r('sum(refwork$gender == "male")')
    SRCfem = r('sum(refwork$gender == "female")')
    SRCun = r('sum(refwork$gender == "unknown")')
    WPmale = r('sum(refwork$gender == "male" & refwork$wp.size != 0)')
    WPfem = r('sum(refwork$gender == "female" & refwork$wp.size != 0)')
    EBmale = r('sum(refwork$gender == "male" & refwork$eb.size != 0)')
    EBfem = r('sum(refwork$gender == "female" & refwork$eb.size != 0)')
    SRCgendered = r('sum(refwork$gender != "unknown")')
    WPgendered = r('sum(refwork$gender != "unknown" & refwork$wp.size != 0)')
    EBgendered = r('sum(refwork$gender != "unknown" & refwork$eb.size != 0)')
    inSRC = r('length(refwork$gender)')
    missingEB = r('sum(refwork$eb.size == 0)')
    missingWP = r('sum(refwork$wp.size == 0)')
    missingBoth = r('sum(refwork$eb.size == 0 & refwork$wp.size == 0)')
    wp_existing_summary = ' '.join((' %s %5.0f ' % (k, v)
        for k, v in r('summary(refwork$wp.size[refwork$wp.size != 0], digits=5)').items()))
    eb_existing_summary = ' '.join((' %s %5.0f ' % (k, v)
        for k, v in r('summary(refwork$eb.size[refwork$eb.size != 0], digits=5)').items()))
    wp_median = r('median(refwork$wp.size[refwork$wp.size != 0])')
    eb_median = r('median(refwork$eb.size[refwork$eb.size != 0])')
    mutual_wp_median = r('median(refwork$wp.size[refwork$wp.size != 0 & '
        'refwork$eb.size != 0])')
    mutual_eb_median = r('median(refwork$eb.size[refwork$eb.size != 0 & '
        'refwork$wp.size != 0])')
    body = E.BODY(
        E.H1("%s Biographies" % source_fn_base)
    )
    html.append(body)
    body.extend((
        E.P(
            "Missing: WP = %d ; EB = %d ; neither = %d . "
            % (missingWP, missingEB, missingBoth)),
        )
    )
    if opts.gender:
        gender_text = (
            "<p>Of %d entries: I guess that %d are <span class='female'>female</span>, "
            "%d are <span class='male'>male</span>, "
            "and %d are <span class='unknown'>unknown</span>. "
            "That is, females are %0.2f of the gender-known population. "
            "Of the Wikipedia articles, females are %0.2f (%d/(%d+%d)); "
            "and %0.2f (%d/(%d+%d)) at Britannica. </p>"
            % (
                inSRC, SRCfem,
                SRCmale,
                SRCun,
                float(SRCfem) / (SRCgendered),
                float(WPfem) / (WPgendered), WPfem, WPmale, WPfem,
                float(EBfem) / (EBgendered), EBfem, EBmale, EBfem
            ))
        body.append(fragment_fromstring(gender_text))
    body.extend((
        E.P(
            "Existing median word count (counting only articles that exist in the respective work): WP = %0.0f ; EB = %0.0f . "
            "WP median article size is roughly %0.1f times larger."
            % (wp_median, eb_median, wp_median / eb_median)),
        E.P("Five-figure summaries for existing articles."),
        E.PRE(
            "WP: %s \n"
            "EB: %s "
            % (wp_existing_summary, eb_existing_summary)),
        E.P(
            "Mutual median word count (for articles that exist in both works): WP median = %0.0f words, EB = %0.0f . "
            "Mutual WP median article size is roughly %0.1f times larger."
            % (mutual_wp_median, mutual_eb_median, mutual_wp_median / mutual_eb_median)),
    ))
    r('png(file="%s.png")' % source_fn_base)  # ,width=733,height=550
    r('boxplot(refwork$wp.size[refwork$wp.size != 0], refwork$eb.size[refwork$eb.size != 0], '
        'names = c("Wikipedia", "Britannica"), '
        'main = "Existing article word counts")')
    body.append(
        E.P(
            E.IMG(alt="size distribution", src="%s.png" % source_fn_base)
        )
    )
    critical("generating table")
    table = E.TABLE(
        E.COL(), E.COL(E.CLASS('col-alt')), E.COL(),
        E.COL(E.CLASS('col-alt')), E.COL(),
        E.THEAD(E.TR(E.TH('Name'),
            E.TH('Wikipedia', width="35%"), E.TH('Words'),
            E.TH('Britannica', width="35%"), E.TH('Words'))),
        width="100%", cellpadding="5", border="1")
    #if opts.export_csv:
        #csv_f.write('source, name, gender, born, died, list, count \n')
    for person in people:
        key, dummy, dummy = split_name_date(person)
        bio = bios[key]
        table.append(
            E.TR(
                E.TD(E.A(bio.name, href='%s' % (people_url)),  # , bio.last_name
                    E.CLASS(bio.gender)),
                E.TD(E.A(bio.wp_title, href=bio.wp_purl), E.CLASS(bio.gender)),
                E.TD(str(bio.wp_wc)),  # size
                E.TD(E.A(bio.eb_title, href=bio.eb_url), E.CLASS(bio.gender)),
                E.TD(str(bio.eb_wc)),  # size
                valign="center",
            ))
        if opts.text_include:
            table.append(
                E.TR(
                    E.TD(''),
                    E.TD(colspan='2', *[E.P(p) for p in bio.wp_text.split('\n')]),
                    E.TD(colspan='2', *[E.P(p) for p in bio.eb_text.split('\n')]),
                    valign="top",
                ),
            )
        if opts.export_csv:
            for source in source_fn_base.split('-'):
                if source.isalpha():
                    break
            csv_f.write(u'%s;%s;%s;%s;%s;WP;%s\n' %
                (source, bio.name, bio.gender, bio.born, bio.died, bio.wp_wc))
            csv_f.write(u'%s;%s;%s;%s;%s;EB;%s\n' %
                (source, bio.name, bio.gender, bio.born, bio.died, bio.eb_wc))
    body.append(table)
    report_f.write(DOCTYPE + lxml.html.tostring(html, pretty_print=True))
    report_f.close()
    if opts.export_csv:
        csv_f.close()
###############
# Web
###############

def url_OK(url):
    '''Check if URL is in right Web space and with textual extension.'''
    dbg(" testing %s" % url)
    is_ok = True
    BAD_EXTENSIONS = ('doc', 'pdf', 'jpg', 'png', 'gif')  # Google
    BAD_WP_NAMESPACES = ('/Wikiquote:', '/Wikisource:')
    if url.split('.')[-1].lower() in BAD_EXTENSIONS:
        is_ok = False
    if 'britannica.com' in url:
        if '/EBchecked/topic/' not in url:
            is_ok = False
    if 'wikipedia.org' in url:
        if '(disambiguation)' in url:  # WP
            is_ok = False
        if any([ns in url for ns in BAD_WP_NAMESPACES]):
            is_ok = False
    dbg(" url_OK = %s" % is_ok)
    return is_ok
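
# For example (illustrative URLs; assumes dbg is bound as in __main__):
#
#   >>> url_OK('http://en.wikipedia.org/wiki/Ada_Lovelace')
#   True
#   >>> url_OK('http://www.britannica.com/bps/search?query=lovelace')
#   False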
def query_google(query, google_queries, do_refresh, retry_counter=0):
    """Return ordered results from Google API via google_queries cache."""
    info(" testing google_queries for '%s'" % query)
    results = google_queries.get(query, None)
    if do_refresh or (results is None and not opts.cache_only):
        info(" refresh or g_cache MISS for %s" % query)
        url = "http://ajax.googleapis.com/ajax/services/" + \
            "search/web?start=0&v=1.0&%s" % (  # &rsz=large for 8 results
            urllib.urlencode({'q': query.encode('utf-8')}))
        request = urllib2.Request(url, None, {'Referer': 'http://reagle.org/joseph/'})
        try:
            search_results = urllib2.urlopen(request)
            json = simplejson.loads(search_results.read())
            dbg(" responseStatus = %s" % json['responseStatus'])
            if json['responseStatus'] == 404:
                raise FailedGet("Google API: %s" % json['responseDetails'])
            results = json['responseData']['results']
            info(" got results")
        except (urllib2.URLError) as e:  # TypeError,
            if retry_counter >= 2:
                raise FailedGet("Failed to get after 3 attempts.")
            info(" retrying query_google after %s" % e)
            time.sleep(retry_counter * 10 + 5)  # pause 5s, 15s, 25s before retrying
            results = query_google(query, google_queries, do_refresh, retry_counter + 1)
        google_queries[query] = results
    if results is None and opts.cache_only:
        raise FailedGet("No cache result found with cache-only option.")
    info(" returning results")
    return results
def query_web(url, web_queries, do_refresh):
    """Grab Web page (encyclopedic article) from URL via web_queries cache."""
    url = url.rsplit('#')[0]  # don't bother with fragments
    info(" testing web_queries for '%s'" % url)
    dbg(" do_refresh = %s, opts.refresh = %s opts.cache_only = %s" %
        (do_refresh, opts.refresh, opts.cache_only))
    html = web_queries.get(url, None)
    if do_refresh or (html is None and not opts.cache_only):
        dbg(" retrieving from Web")
        time.sleep(random.randint(1, 3))
        html, response = get_HTML(url)
        web_queries[url] = html
    else:
        dbg(" found in web_queries cache")
    if html is None and opts.cache_only:
        raise FailedGet("No cache result found with cache-only option.")
    return html
def get_text(content_node):
    '''Return textual content of nodes sans elements.'''
    content_text = ''.join(content_node.xpath("descendant-or-self::text()"))
    return content_text

def remove_node(node):
    '''Remove a node from a tree.'''
    parent = node.getparent()
    parent.remove(node)
def get_wp_text_wc(html):
    '''Return content of article in textual form without miscellany.'''

    def trim_wp(node):
        '''To keep analysis comparable, remove most everything but article prose.'''
        remove_node(node[0])  # "From Wikipedia, the free encyclopedia"
        remove_node(node[1])  # Jump to:navigation,
        try:
            for n in node.xpath('//table'):
                remove_node(n)
        except IndexError:
            pass
        EXCLUDED_SECTIONS = ('See_also', 'Further_Reading', 'Further_reading',
            'Notes', 'External_Links', 'References', 'External_links',
            'Footnote', 'Footnotes', 'Sources', 'Notes_and_sources',
            'Bibliography', 'Notes_and_references',
            'Works', 'Publications',
            'List_of_works', 'Selected_works', 'Novels',
            'Selected_bibliography', 'Publications_and_speeches',
            'Records', 'Other_Records', 'Career_statistics', 'Teams_and_victories',
            'Professional_wins',
            'Accomplishments', 'Championships_and_accomplishments',
            'Records_and_achievements', 'Major_Projects', 'Discography',
            'Original_Compositions', 'Videography', 'Notable_works',
            'Filmography', 'Selected_filmography', 'Partial_filmography',
            'Music_videography', 'Books',
            'Achievements', 'Notable_roles',
            'Television_appearances', 'In_popular_culture',
            'Popular_culture',
            'Titles_and_honours', 'Honours', 'Awards', 'Awards_and_fellowships',
            'Awards_and_nominations', 'Career_highlights',
            'Awards_and_honors', 'Recognition', 'Selected_awards', 'Other_honors',
            'Honors_and_awards', 'Awards_and_recognitions',
            'Titles.2C_styles_and_honours', 'Honorary_degrees', 'Prizes',
            )
        for misc in EXCLUDED_SECTIONS:
            try:
                misc_node = node.xpath('//span[@id="%s"]/ancestor::h2' % misc)[0]
                [remove_node(n) for n in misc_node.xpath('./following-sibling::*')]
                remove_node(misc_node)
            except IndexError:  # in case no node found
                pass
        return node

    html = clean_html(html)
    html_parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True,
        remove_pis=True, strip_cdata=True)
    doc = etree.parse(StringIO(html), html_parser)
    div = doc.xpath("//div[@id='bodyContent']")[0]
    #dbg(" div = %s" % ('\n'.join(n.tag for n in div[1:10])))
    #dbg(lxml.html.tostring(div, pretty_print=True))
    div = trim_wp(div)
    text = get_text(div)
    text = text.replace('[edit]', '')
    return text, len(text.split())
def get_eb_text_wc(html, web_queries, do_recurse, do_refresh):
    '''Return content of article in textual form without miscellany.
    Will follow links if necessary since EB pages are dynamic.
    '''

    def trim_eb(node):
        '''Trim material following the citations.'''
        try:
            misc_node = node.xpath('//h2[text() = "Citations"]')[0]
            dbg(" misc_node %s" % misc_node)
            [remove_node(n) for n in misc_node.xpath('./following::*')]
            remove_node(misc_node)
            dbg(" trimmed it!")
        except IndexError:  # in case no node found
            pass
        return node

    html_parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True,
        remove_pis=True, strip_cdata=True)
    doc = etree.parse(StringIO(html), html_parser)
    if do_recurse:  # EB articles might have other sections to fetch
        EXCLUDED_SECTIONS = (
            "Major Works", "Additional Reading", "Biographies",
            "Critical studies", "Related Articles", "Supplemental Information",
            "Quotations", "Spotlights", "External Web sites", "Citations",
            "Year in Review Links",)
        info(" YES recurse")
        text = ''
        # Find links to other sections
        toc_options = doc.xpath("//div[@id='bps-article-toc']/select/option")
        info("toc_options = %s" % toc_options)
        toc_options.pop(1)  # skip the second since it's included in Main
        dbg(" toc_options %s" % toc_options)
        for opt in toc_options:
            opt_url = 'http://www.britannica.com' + opt.get('value')
            if opt.get('title') not in EXCLUDED_SECTIONS and '#' not in opt_url:
                dbg(" checking %s" % opt.get('title'))
                dbg(" opt_url %s" % opt_url)
                html = query_web(opt_url, web_queries, do_refresh)
                html_parser = etree.HTMLParser(remove_comments=True)
                doc = etree.parse(StringIO(html), html_parser)
                div = doc.xpath('//div[@class="KonaBody"]')  # /ancestor::div
                if div:
                    div = trim_eb(div[0])
                    text += get_text(div)
                else:
                    dbg(" didn't find content in %s" % opt.get('title'))
                    dbg(html)
    else:
        dbg(" NO recurse")
        div = doc.xpath('//div[@id="bps-left-article-wrapper"]')[0]
        div = trim_eb(div)
        dbg(" div %s" % div)
        text = get_text(div)
    return text, len(text.split())
wgCurRevisionId_regexp = re.compile(r'wgCurRevisionId=(\d+)')
oldid_regexp = re.compile(r'oldid=(\d+)')

def get_wp_oldid(wp_html):
    '''Return oldid for construction of permanent URI.'''
    wgCurRevisionId = wgCurRevisionId_regexp.search(wp_html)
    if wgCurRevisionId:
        info("wgCurRevisionId.groups() = %s" % wgCurRevisionId.groups())
        return wgCurRevisionId.groups()[0]
    else:
        oldid = oldid_regexp.search(wp_html)
        if oldid:
            info("oldid.groups() = %s" % oldid.groups())
            return oldid.groups()[0]
        else:
            critical("No oldid found")
            raise FailedGet

def get_wp_purl(bio):
    '''Return formatted permanent URI.'''
    root_url = u'http://en.wikipedia.org/w/index.php?title='
    return root_url + bio.wp_url.split('/')[-1] + '&oldid=' + bio.wp_oldid
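
# For example (a hypothetical bio; the oldid value is made up):
#
#   >>> b = Biography('Ada Lovelace')
#   >>> b.wp_url, b.wp_oldid = 'http://en.wikipedia.org/wiki/Ada_Lovelace', '555'
#   >>> get_wp_purl(b)
#   u'http://en.wikipedia.org/w/index.php?title=Ada_Lovelace&oldid=555'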
###############
# Gender guessing functions
###############

he_re = re.compile(r'\b([Hh]is|[Hh]e)\b')
she_re = re.compile(r'\b([Hh]er|[Ss]he)\b')

def guess_gender_pronouns(text):
    '''Guess gender based on proportion of pronouns.'''
    info("guessing gender via pronouns")
    she = len(she_re.findall(text))
    he = len(he_re.findall(text))
    diff = abs(she - he) / (he + she + 0.1)
    dbg("he = %d, she = %d, diff = %f" % (he, she, diff))
    if diff < 0.25:  # too close to call
        gender = 'unknown'
    elif she > he:
        gender = 'female'
    else:
        gender = 'male'
    dbg("gender = %s" % gender)
    return gender
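
# For example (doctest-style sketch on made-up text; assumes info/dbg are
# bound as in __main__):
#
#   >>> guess_gender_pronouns('She wrote the book. Her prose is vivid.')
#   'female'
#   >>> guess_gender_pronouns('He and she spoke; his words and her words.')
#   'unknown'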
def create_name_dict(fn):
    '''Utility function to build dictionaries of name frequencies.'''
    d = {}
    names = open(fn).readlines()
    for line in names[1:]:  # skip header line
        if ' na ' not in line:
            name, frequency, number, rank = line.split()
            d[name] = {}
            d[name]['freq'] = float(frequency)
    return d
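
# The expected file layout (inferred from the parsing above; these appear to
# be census-style name-frequency tables) is a header line followed by
# whitespace-separated rows of name, frequency, number, and rank, e.g.:
#
#   name freq number rank
#   MARY 2.629 2.629 1
#   PATRICIA 1.073 3.702 2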
FEMALE_NAMES = create_name_dict(HOME + '/joseph/2010/03/names-female.csv')
MALE_NAMES = create_name_dict(HOME + '/joseph/2010/03/names-male.csv')

HONORIFIC_MALE = [honor + ' ' for honor in
    ('Baron', 'Brother', 'Comte', 'Count', 'Duc', 'Duke', 'Earl of',
    'Father', 'Marquess', 'Marquis', 'Prince', 'Sir',
    'Viscount', 'Vicomte')]
HONORIFIC_FEMALE = [honor + ' ' for honor in
    ('Baroness', 'Comtesse', 'Countess', 'Dame',
    'Marchioness', 'Marquise', 'Princess', 'Sister', 'Queen')]
def guess_gender_name(name):
    '''Guess the gender based only on the name, using honorifics and
    statistical tables of name frequencies.

    >>> guess_gender_name('Joseph Reagle')
    'male'
    >>> guess_gender_name('Sir Jehne Smith')
    'male'
    >>> guess_gender_name('Dame Jijij Foo')
    'female'
    '''
    info("guessing gender via names")
    gender = 'unknown'
    if any([honor in name for honor in HONORIFIC_MALE]):
        gender = 'male'
    elif any([honor in name for honor in HONORIFIC_FEMALE]):
        gender = 'female'
    if gender == 'unknown':
        name = name.replace('Dr. ', '')
        given = name.split()[0].upper()
        if given in FEMALE_NAMES and given in MALE_NAMES:
            dbg("freqs = %f %f" % (MALE_NAMES[given]['freq'], FEMALE_NAMES[given]['freq']))
            # for ambiguous names, require a 4:1 frequency ratio before deciding
            if MALE_NAMES[given]['freq'] > 4 * FEMALE_NAMES[given]['freq']:
                gender = 'male'
            elif FEMALE_NAMES[given]['freq'] > 4 * MALE_NAMES[given]['freq']:
                gender = 'female'
        else:
            if given in MALE_NAMES:
                gender = 'male'
            if given in FEMALE_NAMES:
                gender = 'female'
    info("gender = %s" % gender)
    return gender
###############
# Biography class
###############

class Biography():
    name = ''
    last_name = ''
    born = ''
    died = ''
    gender = 'unknown'
    wp_title = ''
    wp_url = ''
    wp_purl = ''
    wp_html = ''
    wp_text = ''
    wp_wc = 0
    wp_ratio = 0
    wp_oldid = ''
    eb_title = ''
    eb_url = ''
    eb_html = ''
    eb_text = ''
    eb_wc = 0

    def __init__(self, name, born='', died=''):
        self.name = name
        self.last_name = name.split(' OR ')[0].split('(')[0].split()[-1]  # use first last name
        self.born = born
        self.died = died

    def __str__(self):
        #showList = ["a", "b"]
        showList = sorted(set(self.__dict__))
        return ("X(%i):\n" % id(self)) + "\n".join([" %s: %s" % (
            key.rjust(8), self.__dict__[key]) for key in showList])
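
# A minimal usage sketch (hypothetical values):
#
#   >>> bio = Biography('Mary Shelley', born='1797', died='1851')
#   >>> bio.last_name
#   'Shelley'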
###############
# Web scrape and build bios
###############

def build_bios(people_proj, people):
    """Build biography for each person including EB/WP information."""
    bios = FileDict(filename='cache/bios-%s' % people_proj + '.db')
    google_queries = FileDict(filename='cache/google-%s' % people_proj + '.db')
    web_queries = FileDict(filename='cache/web-%s' % people_proj + '.db')
    if opts.delete:
        del bios[opts.delete]
        return bios
    CUTOFF = 1900  # omit birth year from queries for pre-1900 births
    for person in people:
        name, born, died = split_name_date(person)
        critical("** Checking '%s' <%s - %s> **" % (name, born, died))
        bio = Biography(name, born, died)
        if opts.refresh and bio.name in opts.refresh:  # so I can pass in more than one name
            do_refresh = True
        else:
            do_refresh = False
        if name in bios:
            if opts.fast_cache or opts.cache_only:
                info(" '%s' is duplicated, skipping" % name)
                continue
        else:
            if opts.cache_only:
                critical("No cached bio found for '%s'" % name)
                raise FailedGet
        info("WIKIPEDIA")
        site = 'en.wikipedia.org'
        if born and int(born) < CUTOFF:
            query = 'site:%s (%s)' % (site, name)
        else:
            query = 'site:%s (%s %s)' % (site, name, born)
        dbg(u" query = %s" % query)
        results = query_google(query, google_queries, do_refresh)
        info(" results %s" % [r['titleNoFormatting'] for r in results])
        for result in results:
            if url_OK(result['url']) and are_similar(bio, result):
                info(" using %s" % result['url'])
                bio.wp_title = unescape(result['titleNoFormatting'].split(' -')[0])
                bio.wp_url = urllib2.unquote(result['url'].encode()).decode(
                    'utf-8', 'replace')
                bio.wp_html = query_web(bio.wp_url, web_queries, do_refresh)
                #if not opts.ambiguous and \
                #        ('disambigbox' in bio.wp_html or 'setindexbox' in bio.wp_html):
                DISAMBIG = ('disambigbox', 'setindexbox')
                if not opts.ambiguous and any([disambig in bio.wp_html for disambig in DISAMBIG]):
                    critical("disambiguating %s at %s" % (bio.wp_title, bio.wp_url))
                    bio.wp_title = bio.wp_title + ' [disambiguated]'
                    parser_dbox = etree.HTMLParser(remove_comments=True)
                    doc_dbox = etree.parse(StringIO(bio.wp_html), parser_dbox)
                    url_dbox = 'http://en.wikipedia.org' + doc_dbox.xpath(
                        "//div[@id='bodyContent']/ul//a[not(@class='new')]/@href")[0]
                    critical("url_dbox %s" % url_dbox)
                    bio.wp_url = url_dbox
                    bio.wp_html = query_web(bio.wp_url, web_queries, do_refresh)
                #info(" bio.wp_html %s" % bio.wp_html)
                bio.wp_oldid = get_wp_oldid(bio.wp_html)
                bio.wp_purl = get_wp_purl(bio)
                bio.wp_text, bio.wp_wc = get_wp_text_wc(bio.wp_html)
                if opts.gender and bio.gender == 'unknown':
                    bio.gender = guess_gender_pronouns(bio.wp_text)
                break
            else:
                info(" not using %s" % result['url'])
        info("BRITANNICA")
        site = 'www.britannica.com'  # /EBchecked/topic/
        if born and int(born) < CUTOFF:
            query = 'site:%s (%s)' % (site, name)
        else:
            query = 'site:%s (%s %s)' % (site, name, born)
        info(" query = %s" % query)
        results = query_google(query, google_queries, do_refresh)
        info(" results %s" % [r['titleNoFormatting'] for r in results])
        for result in results:
            if url_OK(result['url']) and are_similar(bio, result):
                info(" using %s" % result['url'])
                do_recurse = False
                bio.eb_title = unescape(result['titleNoFormatting'].split(' --')[0])
                bio.eb_url = urllib2.unquote(result['url'].encode())
                if bio.eb_url.split('/')[-3] == 'topic':
                    do_recurse = True  # get subsections
                bio.eb_html = query_web(bio.eb_url, web_queries, do_refresh)
                bio.eb_text, bio.eb_wc = get_eb_text_wc(bio.eb_html,
                    web_queries, do_recurse, do_refresh)
                if opts.gender and bio.gender == 'unknown':
                    bio.gender = guess_gender_pronouns(bio.eb_text)
                break
            else:
                info(" not using %s" % result['url'])
        if opts.gender and bio.gender == 'unknown':
            bio.gender = guess_gender_name(name)
        if bio.wp_wc and bio.eb_wc:
            bio.wp_ratio = bio.wp_wc / float(bio.eb_wc)
        else:
            bio.wp_ratio = 0
        bios[name] = bio
    return bios
###############
# Options and source file
###############

if __name__ == '__main__':
    opt_parser = OptionParser(usage="usage: %prog [options] file")
    opt_parser.add_option("-a", "--ambiguous",
        action="store_true", default=False,
        help="don't disambiguate WP pages w/ disambigbox")
    opt_parser.add_option("-g", "--gender",
        action="store_true", default=False,
        help="perform gender analysis")
    opt_parser.add_option('-l', '--log-to-file',
        action="store_true", default=False,
        help="log to file topics-comp.log")
    opt_parser.add_option("-n", "--no-dates",
        action="store_true", default=False,
        help="don't use source birth/death dates")
    opt_parser.add_option("-f", "--fast-cache",
        action="store_true", default=False,
        help="use cache if available, else query Web")
    opt_parser.add_option("-c", "--cache-only",
        action="store_true", default=False,
        help="cache only, do no Web queries")
    opt_parser.add_option("-r", "--refresh",
        help="refresh a particular name in bios cache",
        metavar="NAME")
    opt_parser.add_option("-d", "--delete",
        help="delete a particular name from bios cache",
        metavar="NAME")
    opt_parser.add_option("-t", "--text-include",
        action="store_true", default=False,
        help="include text of article in results {{copyvio}}")
    opt_parser.add_option("-e", "--export-csv",
        action="store_true", default=False,
        help="export a comma separated file in addition to HTML")
    opt_parser.add_option('-v', '--verbose',
        action='count',
        help="increase verbosity (specify multiple times for more)")
    opts, args = opt_parser.parse_args()

    if opts.refresh:
        opts.refresh = opts.refresh.decode('utf-8')
    if opts.log_to_file:
        log_dest = open('topics-comp.log', 'w')
    else:
        log_dest = sys.stderr
    log_level = 100  # default
    if opts.verbose == 1: log_level = logging.CRITICAL  # DEBUG
    elif opts.verbose == 2: log_level = logging.INFO
    elif opts.verbose >= 3: log_level = logging.DEBUG
    logging.basicConfig(stream=log_dest, level=log_level,
        format="%(levelno)s %(funcName).5s: %(message)s")
    critical = logging.critical
    info = logging.info
    dbg = logging.debug

    source_fn = args[0]
    source_fn_base = source_fn.split('.')[0]
    data = codecs.open(source_fn, 'r', 'utf-8').readlines()
    people_url = data[0].strip()  # first line is the URL of the source list
    people = [p.strip() for p in data[1:]]
    bios = build_bios(source_fn_base, people)
    create_html_report(source_fn_base, bios, people, people_url)
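
# Usage sketch (hypothetical file names; the actual source lists are not
# included in this gist). A source file's first line is the URL of the list,
# and each subsequent line is a name with optional <born - died> dates:
#
#   http://example.org/notable-people
#   Mary Shelley <1797 - 1851>
#   Ada Lovelace <1815 - 1852>
#
# Then, e.g.:
#
#   python2.6 comp-topics.py --gender -v people.txt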