Skip to content

Instantly share code, notes, and snippets.

@m4lvin
Last active April 27, 2020 10:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save m4lvin/37bf473ee4e662fa031fc267c67464a9 to your computer and use it in GitHub Desktop.
Save m4lvin/37bf473ee4e662fa031fc267c67464a9 to your computer and use it in GitHub Desktop.
Crawl the Mathematics Genealogy Project backwards, starting with two IDs and stopping when a common ancestor is found.
#!/usr/bin/python
'''
Crawl the Mathematics Genealogy Project backwards, starting with two IDs and stopping when a common ancestor is found.
The output, without the lines starting with "[", is a graph in DOT format for graphviz.
Based on a script by filip (2012-07-25) from https://github.com/eakbas/mathgen/
Edited by m4lvin (2019-05-08).
'''
import re
import sys
import time
import requests
import queue
# Change these to the two IDs for which you want to find a common ancestor.
# They are kept as strings because each ID is appended to `prefix` to build
# the URL of the page to fetch.
idA = "258902"
idB = "235210"
# Base URL of an entry page; an ID is concatenated directly onto the end.
prefix = "https://genealogy.math.ndsu.nodak.edu/id.php?id="
def get_page(url, timeout=30):
    '''
    Download `url` and return the response body as text.

    url     -- address to fetch with an HTTP GET.
    timeout -- seconds to wait for the server before giving up, so that a
               stalled connection cannot hang the crawl forever.

    On any request failure (connection error, timeout, invalid URL, ...)
    the placeholder string "get_page(url) failed" is returned instead of
    raising, so callers always receive a string.
    '''
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only swallow errors raised by the request itself; a bare `except`
        # would also eat KeyboardInterrupt and make the crawler impossible
        # to stop with Ctrl-C while a request is in flight.
        return "get_page(url) failed"
    return response.text
def remove_excess_space(s):
    '''
    Trim `s` and collapse every run of whitespace into a single space.
    '''
    trimmed = s.strip()
    # After stripping there is no leading/trailing whitespace, so splitting
    # on whitespace runs and re-joining with one space is the same as
    # substituting each run with a single space.
    pieces = re.split(r'\s+', trimmed)
    return " ".join(pieces)
def get_name(page):
    '''
    Return the text that precedes `</h2>` on the first line of `page`
    containing that tag (presumably the person's name), or "NOT FOUND"
    when the tag does not occur.
    '''
    match = re.search(r'(.*)</h2>', page)
    if match is None:
        return "NOT FOUND"
    return match.group(1)
def get_year(page):
    '''
    Return the first four-digit number in `page` that is immediately
    followed by `</span>` (presumably the degree year), or "NOT FOUND"
    when there is none.
    '''
    match = re.search(r'(\d{4})</span>', page)
    if match is None:
        return "NOT FOUND"
    return match.group(1)
def get_advisor_ids(page):
    '''
    Collect the IDs linked after each "Advisor...:" label and after each
    "...otor...:" label (e.g. "Promotor") in `page`.

    Returns a list of ID strings: all matches for the "Advisor" pattern
    first, followed by all matches for the "otor" pattern.
    '''
    patterns = (
        r'Advisor.*?:.*?<a href="id\.php\?id=(\d*?)">',
        r'otor.*?:.*?<a href="id\.php\?id=(\d*?)">',
    )
    found = []
    for pattern in patterns:
        found.extend(re.findall(pattern, page))
    return found
def crawl(cache,todo,math_id):
    '''
    Visit the entry `math_id` unless it has been visited already.

    cache   -- dict mapping visited IDs to "name\\nyear"; updated in place.
    todo    -- queue that receives the IDs of all advisors found on the page.
    math_id -- ID (string) of the entry to visit.

    For a new ID the page is fetched, a node line and one edge line per
    advisor are printed, and the advisors are queued for later visits.
    Returns the cached "name\\nyear" string for `math_id`.
    '''
    # Already visited: no request, no output, no waiting.
    if math_id in cache:
        return cache[math_id]

    html = get_page(prefix + math_id)
    person = remove_excess_space(get_name(html))
    degree_year = remove_excess_space(get_year(html))
    print('{0} "{1} {2}"'.format(math_id, person, degree_year))
    cache[math_id] = person+"\n"+degree_year

    for parent_id in get_advisor_ids(html):
        print('{0} -> {1};'.format(parent_id, math_id))
        todo.put(parent_id)

    # be nice to MathGenealogy servers and wait a second after each query
    time.sleep(1)
    return cache[math_id]
def connected(c1, c2):
    '''
    Tell whether the dicts `c1` and `c2` have at least one key in common.

    When they do, "connected!" and the set of shared keys are printed and
    True is returned; otherwise nothing is printed and False is returned.
    '''
    shared = c1.keys() & c2.keys()
    if not shared:
        return False
    print("connected!")
    print(shared)
    return True
if __name__ == '__main__':
    # One visited-cache and one work queue per starting ID; the two crawls
    # advance in lock-step, one entry each per round.
    cache1, cache2 = {}, {}
    todo1, todo2 = queue.Queue(), queue.Queue()
    todo1.put(idA)
    todo2.put(idB)

    while True:
        # Stop once the first crawl has nothing left to visit, or as soon
        # as both crawls have visited a common entry.
        if todo1.empty() or connected(cache1, cache2):
            break
        print ("[current status:", todo1.qsize(), todo2.qsize())
        sys.stdout.flush()
        crawl(cache1,todo1,todo1.get())
        # The second crawl may run dry before the first one does.
        if not todo2.empty():
            crawl(cache2,todo2,todo2.get())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment