Last active
November 11, 2021 04:20
-
-
Save alanbernstein/c4fb09a4393d0c17ccd02632a016f2d4 to your computer and use it in GitHub Desktop.
phylo.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
import sys | |
import wikipedia | |
import requests | |
import re | |
from pprint import pprint | |
# TODO: cache results and use cache to build up a tree | |
# TODO: handle disambiguation page (e.g., for 'orange') | |
# alternately could use this http://www.itis.gov/index.html but requires | |
# a lot more work to decide which search result to use | |
# | |
# https://en.wikipedia.org/wiki/Horse | |
# https://en.wikipedia.org/wiki/Template:Taxobox#Classification | |
# https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=kidney%20bean&rvsection=0&redirects | |
# Taxonomic ranks recognized in Wikipedia taxobox markup, ordered from
# most general (kingdom) to most specific (species).  Latin spellings
# match the field names used in the wikitext.
ranks = ['regnum', 'phylum', 'classis', 'ordo', 'subordo', 'familia', 'genus',
         'species']

# Map each Latin rank name to its common English equivalent.
ranks_english = dict(zip(ranks, ['kingdom', 'phylum', 'class', 'order',
                                 'suborder', 'family', 'genus', 'species']))
def main():
    """Compare the taxonomies of two organisms named on the command line.

    Usage: phylo.py [term1 term2]; defaults to 'dolphin' and 'blue whale'.
    Prints both taxonomies side by side, then the lowest common rank.
    """
    if len(sys.argv) > 2:
        searchterm1 = sys.argv[1]
        searchterm2 = sys.argv[2]
    else:
        searchterm1 = 'dolphin'
        searchterm2 = 'blue whale'
    print(searchterm1, searchterm2)

    # The original duplicated the lookup logic per term; do it in a loop.
    taxa = []         # one taxonomy dict per search term
    all_found = True  # only compare when both lookups succeeded
    for term in (searchterm1, searchterm2):
        tax = {'common name': term}
        taxobox = get_taxobox_from_search_term(term)
        if taxobox:
            tax = get_taxonomy_from_taxobox(taxobox, tax)
        else:
            print('unable to retrieve info for %s' % tax['common name'])
            all_found = False
        taxa.append(tax)

    if all_found:
        print_taxonomy(taxa)
        get_lowest_common_node(taxa[0], taxa[1])
def get_taxonomy(search_term):
    """Look up *search_term* on Wikipedia and return its taxonomy dict.

    The returned dict always contains 'common name'; on success it also
    maps each Latin rank found in the page's taxobox to its value.
    """
    tax = {'common name': search_term}
    taxobox = get_taxobox_from_search_term(search_term)
    if taxobox:
        tax = get_taxonomy_from_taxobox(taxobox, tax)
    else:
        print('unable to retrieve info for %s' % tax['common name'])
    # Fix: the original fell off the end and returned None (its own TODO
    # noted it "needs to return").  Return the (possibly partial) taxonomy.
    return tax
def get_taxobox_from_search_term(term, d=0):
    """Fetch wikitext containing the 'taxobox' infobox for *term*.

    Given a search term, try to get the "taxobox" infobox about the
    living thing described by that term.  Several page shapes occur:
    - page matching the term has a plain taxobox:
      done, extract taxobox contents and pass on
    - page has no taxobox: recurse (one level deep) into intro-section
      links, preferring links preceded by 'species', 'genus', 'family'
      e.g. https://en.wikipedia.org/wiki/Salmon
           https://en.wikipedia.org/wiki/Hazelnut
    - page is a redirect: handled by the '&redirects' API parameter
      e.g. https://en.wikipedia.org/wiki/Phaseolus_lunatus (lima bean)
    - page contains a taxobox that is itself a template ('automatic
      taxobox' / 'speciesbox'); resolving those is not implemented yet
      e.g. https://en.wikipedia.org/wiki/Spider ->
           https://en.wikipedia.org/wiki/Template:Taxonomy/Araneae

    d is the recursion depth; link-following only happens at d == 0.
    Returns the page wikitext containing the taxobox, or None.

    todo: record which of these cases occur for which terms
    """
    query = ('https://en.wikipedia.org/w/api.php?action=query'
             '&prop=revisions&rvprop=content&format=json&titles='
             + term + '&rvsection=0&redirects')
    resp = requests.get(query)
    content = resp.text
    taxobox = None
    # (A debug print of the full API response used to live here; removed
    # because it spammed stdout on every request, including recursive ones.)
    if '#REDIRECT' in content:
        s = 'is redirect'
        # handled by the '&redirects' parameter in the request string
    elif 'automatic taxobox' in content.lower():
        s = 'contains taxobox template'
        # TODO: find the 'taxon' entry in the taxobox and follow it.
        # example: spiders is 'taxon = Araneae' and uses
        # https://en.wikipedia.org/wiki/Template:Taxonomy/Araneae
    elif 'speciesbox' in content.lower():
        s = 'contains speciesbox template'
        # TODO: same as above; garlic is 'taxon = Allium sativum' via
        # https://en.wikipedia.org/wiki/Template:Taxonomy/Allium
    elif 'taxobox' in content.lower():
        s = 'contains usable taxobox'
        taxobox = content
    else:
        s = 'contains no taxobox... '
        if d == 0:
            # Check pages of all intro-section links for taxoboxes:
            # first links preceded by the word 'species', then 'genus',
            # then 'family', then all other links in the intro section.
            # Collect [preceding word, link target] pairs.
            p = r'[^ ]*\[\[([^]]*)\]\]'
            links = []
            for m in re.finditer(p, content):
                spind = content.rfind(' ', 0, m.start(0) - 1)
                prevword = content[spind + 1:m.start(0) - 1]
                links.append([prevword, m.group(1)])
            # Check the species/genus/family links first.
            for prev_word in ['species', 'genus', 'family']:
                links2 = [n for n in links if prev_word in n[0]]
                if not taxobox:
                    for l in links2:
                        tb = get_taxobox_from_search_term(l[1], d + 1)
                        if tb:
                            taxobox = tb
                            s = ('retrieved info from ' + prev_word +
                                 ' link "' + l[1] + '"')
                            break
            # Fall back to every other link.  Fix: the original's trailing
            # else could clobber a success message; the for/else below only
            # fires when the fallback loop completes without finding one.
            if not taxobox:
                for l in links:
                    tb = get_taxobox_from_search_term(l[1], d + 1)
                    if tb:
                        taxobox = tb
                        s = 'retrieved info from link "' + l[1] + '"'
                        break
                else:
                    s = 'no taxobox in any first-level links'
    if d == 0:
        print(' ' * d + term + ': ' + s)
    return taxobox
def get_taxonomy_from_expanded_taxobox(taxobox, taxonomy):
    """Placeholder for parsing a template-expanded taxobox.

    Not yet implemented; always returns None.
    """
    return None
def get_taxonomy_from_taxobox(taxobox, taxonomy, verbose=False):
    """Parse Latin-rank entries out of raw taxobox wikitext.

    taxobox: page wikitext containing rows such as
        '| regnum = [[Animalia]]' or "| species='''''B. musculus'''''"
    taxonomy: dict to fill in; must contain 'common name'.
    verbose: print intermediate parsing state (debugging aid; replaces
        the old hard-coded PRINT = 0 flag, default behavior unchanged).

    Returns *taxonomy* with an entry added for every rank in `ranks`
    found in the taxobox.

    Known limitations (TODO):
    - uses the link target, not the link display name
    - sub/super/infra/unranked rank variants are not handled
    - multiple values per rank: only the first is kept
    """
    if verbose:
        print('')
        print(taxonomy['common name'])
        print(taxobox)
    for r in ranks:
        # Extract the 'row' of the taxobox holding this rank: the text
        # from the rank name up to the next '|' field separator.
        m = re.search(r + r'[ ]*=[^|]*\|', taxobox)
        if m is None:
            continue
        row = re.sub('{.*', '', m.group(0))  # drop {{...}} citation entities
        # Strip wiki markup — link brackets [[...]], list bullets (*) and
        # italic/bold quotes (') — so only the bare value text remains.
        # (The old code also collected link names into an unused rnames1
        # list; that dead computation is removed.)
        for junk in ('[[', ']]', '*', "'"):
            row = row.replace(junk, '')
        # Pull the value after '='.  The row pattern guarantees an '=' is
        # present, so this search always matches (possibly an empty name).
        val = re.search(r'=[ ]*[a-zA-Z .]*', row).group(0)
        val = val.replace('=', '').replace('\\n', '').strip()
        taxonomy[r] = val
        if verbose:
            print(r, row, val)
    if len(taxonomy) < 2:
        # only 'common name' is present: nothing was parsed
        print('no data found for %s' % taxonomy['common name'])
    return taxonomy
def print_taxonomy(taxa):
    """Print the taxonomies in *taxa* as aligned columns, one per taxon.

    The header row shows each taxon's common name; each following row is
    one rank (labelled in English).  Rows that would be entirely blank
    are suppressed.
    """
    header_label = '%10s ' % ''
    header_cells = ''.join('%15s' % t['common name'] for t in taxa)
    if header_cells.strip():
        print(header_label + header_cells)
    for rank in ranks:
        row_label = '%10s: ' % ranks_english[rank]
        # Missing ranks render as a blank 15-character cell.
        row_cells = ''.join('%15s' % t.get(rank, ' ') for t in taxa)
        if row_cells.strip():
            print(row_label + row_cells)
def get_lowest_common_node(a, b):
    """Report the most specific rank shared by taxonomies *a* and *b*.

    Scans ranks from most specific (species) to most general (kingdom)
    and prints the first rank at which both dicts agree.

    Returns (english_rank_name, value) for that match, or None when the
    two taxonomies share no rank.  (Fix: the function previously returned
    nothing, belying its 'get_' name; existing callers ignore the return
    value, so this is backward compatible.)
    """
    for r in reversed(ranks):
        if r in a and r in b and a[r] == b[r]:
            print('"%s" and "%s" share the same %s (%s)' %
                  (a['common name'], b['common name'], ranks_english[r], a[r]))
            return ranks_english[r], a[r]
    return None
def extract_ranks(text):
    """Placeholder: discover which rank names appear in raw taxobox text.

    Planned approach: start with the known list (regnum, phylum, ...,
    genus, species), split *text* into lines, look for lines containing
    those entries, then inspect the intermediate lines for further
    ranks.  Not yet implemented; always returns None.
    """
    return None
# Run the two-term comparison when executed as a script (not on import).
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment