Skip to content

Instantly share code, notes, and snippets.

@alanbernstein
Last active November 11, 2021 04:20
Show Gist options
  • Save alanbernstein/c4fb09a4393d0c17ccd02632a016f2d4 to your computer and use it in GitHub Desktop.
Save alanbernstein/c4fb09a4393d0c17ccd02632a016f2d4 to your computer and use it in GitHub Desktop.
phylo.py
#!/usr/local/bin/python
import sys
import wikipedia
import requests
import re
from pprint import pprint
# TODO: cache results and use cache to build up a tree
# TODO: handle disambiguation page (e.g., for 'orange')
# alternately could use this http://www.itis.gov/index.html but requires
# a lot more work to decide which search result to use
#
# https://en.wikipedia.org/wiki/Horse
# https://en.wikipedia.org/wiki/Template:Taxobox#Classification
# https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=kidney%20bean&rvsection=0&redirects
# Taxobox parameter names (Latin) paired with the English label used when
# printing, listed from the broadest rank down to the most specific one.
_RANK_TABLE = (
    ('regnum', 'kingdom'),
    ('phylum', 'phylum'),
    ('classis', 'class'),
    ('ordo', 'order'),
    ('subordo', 'suborder'),
    ('familia', 'family'),
    ('genus', 'genus'),
    ('species', 'species'),
)

# Ordered list of the rank keys looked up in a taxobox.
ranks = [latin for latin, _english in _RANK_TABLE]

# Mapping from taxobox rank key to its English display name.
ranks_english = dict(_RANK_TABLE)
def main():
    """Compare the taxonomy of two organisms and report what they share.

    The two search terms are taken from ``sys.argv[1]`` and ``sys.argv[2]``
    when both are present; otherwise 'dolphin' and 'blue whale' are used.
    Each term is looked up in turn; a failed lookup is reported and the
    comparison is skipped unless both lookups produced a taxobox.
    """
    if len(sys.argv) > 2:
        terms = [sys.argv[1], sys.argv[2]]
    else:
        terms = ['dolphin', 'blue whale']
    print(terms[0], terms[1])

    taxoboxes = []
    taxonomies = []
    for term in terms:
        tax = {'common name': term}
        taxobox = get_taxobox_from_search_term(term)
        if taxobox:
            tax = get_taxonomy_from_taxobox(taxobox, tax)
        else:
            print('unable to retrieve info for %s' % tax['common name'])
        taxoboxes.append(taxobox)
        taxonomies.append(tax)

    # Only compare when both lookups produced a taxobox.
    if all(taxoboxes):
        print_taxonomy(taxonomies)
        get_lowest_common_node(taxonomies[0], taxonomies[1])
def get_taxonomy(search_term):
    """Look up *search_term* and parse the taxonomy out of its taxobox.

    Args:
        search_term: Common name (page title) to look up.

    Returns:
        A ``(taxobox, tax)`` tuple. ``taxobox`` is whatever
        ``get_taxobox_from_search_term`` returned (falsy when nothing was
        found). ``tax`` is a dict that always holds ``'common name'`` and,
        when a taxobox was found, the ranks parsed from it.
    """
    tax = {'common name': search_term}
    taxobox = get_taxobox_from_search_term(search_term)
    if taxobox:
        tax = get_taxonomy_from_taxobox(taxobox, tax)
    else:
        print('unable to retrieve info for %s' % tax['common name'])
    # Previously nothing was returned, so the result was unusable by callers.
    return taxobox, tax
_WIKIPEDIA_API = 'https://en.wikipedia.org/w/api.php'
_REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled request hangs forever
_DUMP_RESPONSES = False  # set to True to print every raw API response
# A wikilink, together with any non-space characters glued to its left
# (e.g. the '' of italic markup), capturing the text between [[ and ]].
_WIKILINK_RE = re.compile(r'[^ ]*\[\[([^]]*)\]\]')


def _extract_links(content):
    """Return ``[preceding_word, target]`` pairs for each wikilink in *content*."""
    links = []
    for match in _WIKILINK_RE.finditer(content):
        # [[Target|label]] and [[Target#Section]] both refer to page "Target";
        # a raw '|' would be read by the API as a separator between titles.
        target = match.group(1).split('|', 1)[0].split('#', 1)[0].strip()
        if not target:
            continue
        # Walk back over the spaces before the link, then take the word
        # that ends there.
        end = match.start(0)
        while end > 0 and content[end - 1] == ' ':
            end -= 1
        start = content.rfind(' ', 0, end) + 1
        links.append([content[start:end], target])
    return links


def _search_linked_pages(content, d):
    """Try the pages linked from *content*; return ``(taxobox, description)``.

    Links preceded by 'species', then 'genus', then 'family' are tried
    first, followed by every remaining link. Returns ``(None, None)`` when
    no linked page yields a taxobox.
    """
    links = _extract_links(content)
    tried = set()  # avoid requesting the same page twice
    for hint in ('species', 'genus', 'family'):
        for prev_word, target in links:
            if hint not in prev_word or target in tried:
                continue
            tried.add(target)
            taxobox = get_taxobox_from_search_term(target, d + 1)
            if taxobox:
                return taxobox, ('retrieved info from ' + hint + ' link "'
                                 + target + '"')
    for _prev_word, target in links:
        if target in tried:
            continue
        tried.add(target)
        taxobox = get_taxobox_from_search_term(target, d + 1)
        if taxobox:
            return taxobox, 'retrieved info from link "' + target + '"'
    return None, None


def get_taxobox_from_search_term(term, d=0):
    """Fetch the intro section of the Wikipedia page *term* and find a taxobox.

    The raw API response text is classified as one of:
      - a redirect (normally already resolved by the ``redirects`` request
        parameter),
      - a page using an "automatic taxobox" or "speciesbox" template, whose
        ranks live in a separate Template:Taxonomy/... page (e.g. Spider ->
        Araneae, garlic -> Allium); not handled yet,
      - a page with a plain taxobox, which is returned as-is,
      - a page with no taxobox (e.g. Salmon, Hazelnut); at depth 0 the
        pages it links to are then tried, one level deep only.

    Args:
        term: Page title to look up.
        d: Recursion depth; 0 for the user's term, 1 for a linked page.

    Returns:
        The raw response text containing a usable taxobox, or ``None`` when
        none was found or the request failed.
    """
    # todo: record which of the cases above occur for which terms
    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'titles': term,
        'rvsection': '0',
        'redirects': '1',
    }
    try:
        # Passing params lets requests escape the title; concatenating it
        # into the URL broke on characters such as '&' or '#'.
        resp = requests.get(_WIKIPEDIA_API, params=params,
                            timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        if d == 0:
            print(term + ': request failed (%s)' % err)
        return None

    content = resp.text
    if _DUMP_RESPONSES:
        print(content)

    lowered = content.lower()
    taxobox = None
    if '#REDIRECT' in content:
        s = 'is redirect'
    elif 'automatic taxobox' in lowered:
        s = 'contains taxobox template'
    elif 'speciesbox' in lowered:
        s = 'contains speciesbox template'
    elif 'taxobox' in lowered:
        s = 'contains usable taxobox'
        taxobox = content
    else:
        s = 'contains no taxobox... '
        if d == 0:
            taxobox, found_via = _search_linked_pages(content, d)
            s = found_via or 'no taxobox in any first-level links'

    # Only the top-level lookup reports its outcome.
    if d == 0:
        print(term + ': ' + s)
    return taxobox
def get_taxonomy_from_expanded_taxobox(taxobox, taxonomy):
    """Placeholder for parsing a template-expanded taxobox.

    Not implemented: both arguments are ignored and ``None`` is returned.
    """
    return None
# [[target|label]] -> capture the label, which is the displayed name.
_PIPED_LINK_RE = re.compile(r'\[\[[^\]|]*\|([^\]|]*)\]\]')
# The value after '=': letters, spaces and dots only, so it stops at the
# first markup character or escaped newline (i.e. at the first of many values).
_RANK_VALUE_RE = re.compile(r'=[ ]*([a-zA-Z .]*)')
# A rank name must not continue a longer word ("ordo" inside "superordo"),
# but may directly follow an escaped newline (backslash + 'n') as found in
# the raw JSON text.
_RANK_BOUNDARY = r'(?:(?<![A-Za-z])|(?<=\\n))'


def get_taxonomy_from_taxobox(taxobox, taxonomy, rank_names=None):
    """Extract rank values from taxobox wikitext into *taxonomy*.

    Example rows (dolphin, blue whale):
        | ordo = [[Cetacea]]\\n| familia = *[[Delphinidae]]\\n*[[Iniidae]]\\n|
        regnum=[[Animal]]ia\\n | species='''''B. musculus '''''\\n |

    Handled: piped links (the displayed name is used), text glued to a link
    (``[[Animal]]ia``), citations/templates after the value, bold/italic
    quotes, bullet lists (first value only), missing ranks.
    Not handled: collecting all values of a multi-valued rank; treating
    sub/super/infra ranks specially beyond not confusing them with the
    plain rank.

    Args:
        taxobox: Text containing ``| rank = value |`` rows.
        taxonomy: Dict to fill in; expected to hold ``'common name'``.
        rank_names: Rank parameter names to look for; defaults to the
            module-level ``ranks`` list.

    Returns:
        The same *taxonomy* dict, with one entry per rank that was found
        and had a non-empty value.
    """
    rank_names = list(ranks if rank_names is None else rank_names)

    # [[xyz|qwe abc]] -> [[qwe abc]]; otherwise the '|' inside the link is
    # mistaken for the end of the row and the link target is reported.
    text = _PIPED_LINK_RE.sub(r'[[\1]]', taxobox)

    for rank in rank_names:
        # The 'row' runs from the rank name up to the next '|'.
        row_match = re.search(
            _RANK_BOUNDARY + re.escape(rank) + r'[ ]*=[^|]*\|', text)
        if row_match is None:
            continue
        row = re.sub(r'\{.*', '', row_match.group(0))  # drop {{...}} entities
        for markup in ('[[', ']]', '*', "'"):
            row = row.replace(markup, '')
        value = _RANK_VALUE_RE.search(row).group(1).strip()
        # An empty value carries no information and would make two
        # unrelated taxa look equal at this rank.
        if value:
            taxonomy[rank] = value

    if not any(rank in taxonomy for rank in rank_names):
        print('no data found for %s' % taxonomy.get('common name', 'unknown'))
    return taxonomy
def print_taxonomy(taxa, rank_names=None, rank_labels=None):
    """Print the taxa side by side, one row per rank.

    The first line holds the common names; each following line holds one
    rank label and that rank's value for every taxon (blank when a taxon
    lacks it). Rows in which no taxon has a value are omitted.

    Args:
        taxa: Sequence of taxonomy dicts, each holding ``'common name'``.
        rank_names: Rank keys to print, in order; defaults to the
            module-level ``ranks``.
        rank_labels: Mapping from rank key to display label; defaults to
            the module-level ``ranks_english``. A rank missing from the
            mapping is labelled with its own key.
    """
    if rank_names is None:
        rank_names = ranks
    if rank_labels is None:
        rank_labels = ranks_english

    header = ''.join('%15s' % t['common name'] for t in taxa)
    if header.strip():
        # Two trailing spaces so the prefix is as wide as the '%10s: ' used
        # for the rank rows; with one space the header sat a column left.
        print('%10s  ' % '' + header)

    for rank in rank_names:
        cells = ''.join('%15s' % t.get(rank, ' ') for t in taxa)
        if cells.strip():
            print('%10s: ' % rank_labels.get(rank, rank) + cells)
def get_lowest_common_node(a, b, rank_names=None, rank_labels=None):
    """Find and report the most specific rank on which two taxonomies agree.

    Ranks are checked from the most specific to the broadest; the first one
    present in both dicts with the same non-empty value is printed.

    Args:
        a: First taxonomy dict (holds ``'common name'`` plus rank values).
        b: Second taxonomy dict.
        rank_names: Rank keys ordered broadest-first; defaults to the
            module-level ``ranks``.
        rank_labels: Mapping from rank key to display label; defaults to
            the module-level ``ranks_english``.

    Returns:
        ``(rank, value)`` for the shared rank, or ``None`` when the two
        taxonomies share no rank value (nothing is printed in that case).
    """
    if rank_names is None:
        rank_names = ranks
    if rank_labels is None:
        rank_labels = ranks_english

    for rank in reversed(list(rank_names)):
        value = a.get(rank)
        # An empty value means "unknown", so two empties are not a match.
        if value and value == b.get(rank):
            print('"%s" and "%s" share the same %s (%s)' %
                  (a['common name'], b['common name'],
                   rank_labels.get(rank, rank), value))
            return rank, value
    return None
def extract_ranks(text):
    """Placeholder for a line-based rank extractor; not implemented.

    Planned approach: start from a known list of ranks (regnum, phylum,
    ..., genus, species), split the text into lines, find the lines that
    contain those entries, and inspect the lines in between them.

    Currently ignores *text* and returns ``None``.
    """
    return None
# Run the comparison only when executed as a script, so that importing this
# module has no side effects.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment