Last active
November 11, 2021 04:20
-
-
Save alanbernstein/c4fb09a4393d0c17ccd02632a016f2d4 to your computer and use it in GitHub Desktop.
phylo.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
import sys | |
import wikipedia | |
import requests | |
import re | |
from pprint import pprint | |
# TODO: cache results and use cache to build up a tree | |
# TODO: handle disambiguation page (e.g., for 'orange') | |
# alternately could use this http://www.itis.gov/index.html but requires | |
# a lot more work to decide which search result to use | |
# | |
# https://en.wikipedia.org/wiki/Horse | |
# https://en.wikipedia.org/wiki/Template:Taxobox#Classification | |
# https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=kidney%20bean&rvsection=0&redirects | |
# Taxonomic ranks recognized in Wikipedia taxobox markup, ordered from
# most general (kingdom) to most specific (species).  Latin spellings
# match the field names used in the wikitext.
ranks = ['regnum', 'phylum', 'classis', 'ordo', 'subordo', 'familia', 'genus',
         'species']

# Map each Latin rank name to its common English equivalent.
ranks_english = dict(zip(ranks, ['kingdom', 'phylum', 'class', 'order',
                                 'suborder', 'family', 'genus', 'species']))
def main():
    """Compare the taxonomies of two organisms named on the command line.

    Usage: phylo.py [term1 term2]; defaults to 'dolphin' and 'blue whale'.
    Prints both taxonomies side by side, then the lowest common rank.
    """
    if len(sys.argv) > 2:
        searchterm1 = sys.argv[1]
        searchterm2 = sys.argv[2]
    else:
        searchterm1 = 'dolphin'
        searchterm2 = 'blue whale'
    print(searchterm1, searchterm2)

    # The original duplicated the lookup logic per term; do it in a loop.
    taxa = []         # one taxonomy dict per search term
    all_found = True  # only compare when both lookups succeeded
    for term in (searchterm1, searchterm2):
        tax = {'common name': term}
        taxobox = get_taxobox_from_search_term(term)
        if taxobox:
            tax = get_taxonomy_from_taxobox(taxobox, tax)
        else:
            print('unable to retrieve info for %s' % tax['common name'])
            all_found = False
        taxa.append(tax)

    if all_found:
        print_taxonomy(taxa)
        get_lowest_common_node(taxa[0], taxa[1])
def get_taxonomy(search_term):
    """Look up *search_term* on Wikipedia and return its taxonomy dict.

    The returned dict always contains 'common name'; on success it also
    maps each Latin rank found in the page's taxobox to its value.
    """
    tax = {'common name': search_term}
    taxobox = get_taxobox_from_search_term(search_term)
    if taxobox:
        tax = get_taxonomy_from_taxobox(taxobox, tax)
    else:
        print('unable to retrieve info for %s' % tax['common name'])
    # Fix: the original fell off the end and returned None (its own TODO
    # noted it "needs to return").  Return the (possibly partial) taxonomy.
    return tax
def get_taxobox_from_search_term(term, d=0):
    """Fetch wikitext containing the 'taxobox' infobox for *term*.

    Given a search term, try to get the "taxobox" infobox about the
    living thing described by that term.  Several page shapes occur:
    - page matching the term has a plain taxobox:
      done, extract taxobox contents and pass on
    - page has no taxobox: recurse (one level deep) into intro-section
      links, preferring links preceded by 'species', 'genus', 'family'
      e.g. https://en.wikipedia.org/wiki/Salmon
           https://en.wikipedia.org/wiki/Hazelnut
    - page is a redirect: handled by the '&redirects' API parameter
      e.g. https://en.wikipedia.org/wiki/Phaseolus_lunatus (lima bean)
    - page contains a taxobox that is itself a template ('automatic
      taxobox' / 'speciesbox'); resolving those is not implemented yet
      e.g. https://en.wikipedia.org/wiki/Spider ->
           https://en.wikipedia.org/wiki/Template:Taxonomy/Araneae

    d is the recursion depth; link-following only happens at d == 0.
    Returns the page wikitext containing the taxobox, or None.

    todo: record which of these cases occur for which terms
    """
    query = ('https://en.wikipedia.org/w/api.php?action=query'
             '&prop=revisions&rvprop=content&format=json&titles='
             + term + '&rvsection=0&redirects')
    resp = requests.get(query)
    content = resp.text
    taxobox = None
    # (A debug print of the full API response used to live here; removed
    # because it spammed stdout on every request, including recursive ones.)
    if '#REDIRECT' in content:
        s = 'is redirect'
        # handled by the '&redirects' parameter in the request string
    elif 'automatic taxobox' in content.lower():
        s = 'contains taxobox template'
        # TODO: find the 'taxon' entry in the taxobox and follow it.
        # example: spiders is 'taxon = Araneae' and uses
        # https://en.wikipedia.org/wiki/Template:Taxonomy/Araneae
    elif 'speciesbox' in content.lower():
        s = 'contains speciesbox template'
        # TODO: same as above; garlic is 'taxon = Allium sativum' via
        # https://en.wikipedia.org/wiki/Template:Taxonomy/Allium
    elif 'taxobox' in content.lower():
        s = 'contains usable taxobox'
        taxobox = content
    else:
        s = 'contains no taxobox... '
        if d == 0:
            # Check pages of all intro-section links for taxoboxes:
            # first links preceded by the word 'species', then 'genus',
            # then 'family', then all other links in the intro section.
            # Collect [preceding word, link target] pairs.
            p = r'[^ ]*\[\[([^]]*)\]\]'
            links = []
            for m in re.finditer(p, content):
                spind = content.rfind(' ', 0, m.start(0) - 1)
                prevword = content[spind + 1:m.start(0) - 1]
                links.append([prevword, m.group(1)])
            # Check the species/genus/family links first.
            for prev_word in ['species', 'genus', 'family']:
                links2 = [n for n in links if prev_word in n[0]]
                if not taxobox:
                    for l in links2:
                        tb = get_taxobox_from_search_term(l[1], d + 1)
                        if tb:
                            taxobox = tb
                            s = ('retrieved info from ' + prev_word +
                                 ' link "' + l[1] + '"')
                            break
            # Fall back to every other link.  Fix: the original's trailing
            # else could clobber a success message; the for/else below only
            # fires when the fallback loop completes without finding one.
            if not taxobox:
                for l in links:
                    tb = get_taxobox_from_search_term(l[1], d + 1)
                    if tb:
                        taxobox = tb
                        s = 'retrieved info from link "' + l[1] + '"'
                        break
                else:
                    s = 'no taxobox in any first-level links'
    if d == 0:
        print(' ' * d + term + ': ' + s)
    return taxobox
def get_taxonomy_from_expanded_taxobox(taxobox, taxonomy):
    """Placeholder for parsing a template-expanded taxobox.

    Not yet implemented; always returns None.
    """
    return None
def get_taxonomy_from_taxobox(taxobox, taxonomy, verbose=False):
    """Parse Latin-rank entries out of raw taxobox wikitext.

    taxobox: page wikitext containing rows such as
        '| regnum = [[Animalia]]' or "| species='''''B. musculus'''''"
    taxonomy: dict to fill in; must contain 'common name'.
    verbose: print intermediate parsing state (debugging aid; replaces
        the old hard-coded PRINT = 0 flag, default behavior unchanged).

    Returns *taxonomy* with an entry added for every rank in `ranks`
    found in the taxobox.

    Known limitations (TODO):
    - uses the link target, not the link display name
    - sub/super/infra/unranked rank variants are not handled
    - multiple values per rank: only the first is kept
    """
    if verbose:
        print('')
        print(taxonomy['common name'])
        print(taxobox)
    for r in ranks:
        # Extract the 'row' of the taxobox holding this rank: the text
        # from the rank name up to the next '|' field separator.
        m = re.search(r + r'[ ]*=[^|]*\|', taxobox)
        if m is None:
            continue
        row = re.sub('{.*', '', m.group(0))  # drop {{...}} citation entities
        # Strip wiki markup — link brackets [[...]], list bullets (*) and
        # italic/bold quotes (') — so only the bare value text remains.
        # (The old code also collected link names into an unused rnames1
        # list; that dead computation is removed.)
        for junk in ('[[', ']]', '*', "'"):
            row = row.replace(junk, '')
        # Pull the value after '='.  The row pattern guarantees an '=' is
        # present, so this search always matches (possibly an empty name).
        val = re.search(r'=[ ]*[a-zA-Z .]*', row).group(0)
        val = val.replace('=', '').replace('\\n', '').strip()
        taxonomy[r] = val
        if verbose:
            print(r, row, val)
    if len(taxonomy) < 2:
        # only 'common name' is present: nothing was parsed
        print('no data found for %s' % taxonomy['common name'])
    return taxonomy
def print_taxonomy(taxa):
    """Print the taxonomies in *taxa* as aligned columns, one per taxon.

    The header row shows each taxon's common name; each following row is
    one rank (labelled in English).  Rows that would be entirely blank
    are suppressed.
    """
    header_label = '%10s ' % ''
    header_cells = ''.join('%15s' % t['common name'] for t in taxa)
    if header_cells.strip():
        print(header_label + header_cells)
    for rank in ranks:
        row_label = '%10s: ' % ranks_english[rank]
        # Missing ranks render as a blank 15-character cell.
        row_cells = ''.join('%15s' % t.get(rank, ' ') for t in taxa)
        if row_cells.strip():
            print(row_label + row_cells)
def get_lowest_common_node(a, b):
    """Report the most specific rank shared by taxonomies *a* and *b*.

    Scans ranks from most specific (species) to most general (kingdom)
    and prints the first rank at which both dicts agree.

    Returns (english_rank_name, value) for that match, or None when the
    two taxonomies share no rank.  (Fix: the function previously returned
    nothing, belying its 'get_' name; existing callers ignore the return
    value, so this is backward compatible.)
    """
    for r in reversed(ranks):
        if r in a and r in b and a[r] == b[r]:
            print('"%s" and "%s" share the same %s (%s)' %
                  (a['common name'], b['common name'], ranks_english[r], a[r]))
            return ranks_english[r], a[r]
    return None
def extract_ranks(text):
    """Placeholder: discover which rank names appear in raw taxobox text.

    Planned approach: start with the known list (regnum, phylum, ...,
    genus, species), split *text* into lines, look for lines containing
    those entries, then inspect the intermediate lines for further
    ranks.  Not yet implemented; always returns None.
    """
    return None
# Run the two-term comparison when executed as a script (not on import).
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment