Skip to content

Instantly share code, notes, and snippets.

@ibaaj
Created December 3, 2022 12:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ibaaj/da199911ec2b32c13d3389d245bb0cf5 to your computer and use it in GitHub Desktop.
Save ibaaj/da199911ec2b32c13d3389d245bb0cf5 to your computer and use it in GitHub Desktop.
parsing mathgenealogy
import pprint
import re
import requests
from bs4 import BeautifulSoup, SoupStrainer
import time
import json
# ---- Mutable crawl state, filled in by the script further down ----

# Page the crawl starts from: Langevin,
# https://www.mathgenealogy.org/id.php?id=125567
IDCURRENT: int = 125567

# id -> display name for every person discovered so far.
ALLAUTHORS: dict = dict()

# (scraped id, id linked from that page) pairs.
EDGES: list = list()

# Entries recorded once their page has been fetched.
ALREADYSCRAPED: list = list()

# Work list of ids whose pages still have to be fetched.
REMAININGIDTOSCRAP: list = list()
def parseIdsAndGetIdsName(idstart, timeout=30):
    """Fetch one mathgenealogy.org record and return the people it links to.

    Downloads ``https://www.mathgenealogy.org/id.php?id=<idstart>`` and looks
    for the first ``<p>`` whose inline style matches
    ``text-align: center; line-height: 2.75ex``. Every ``<a>`` inside that
    paragraph whose ``href`` carries an ``id=<digits>`` query parameter
    contributes one entry to the result.
    NOTE(review): presumably this paragraph is the advisor list of the
    record -- confirm against the live page markup.

    Args:
        idstart: Record id to fetch (anything ``str()`` can render).
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the crawl forever.

    Returns:
        A dict mapping each linked id (as a ``str``) to the link text with
        runs of whitespace (including non-breaking spaces) collapsed to a
        single space. Empty when the paragraph is missing or holds no
        usable link.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    page = requests.get(
        "https://www.mathgenealogy.org/id.php?id=" + str(idstart),
        timeout=timeout,
    )
    soup = BeautifulSoup(page.content, 'html.parser')
    advisor_paragraph = soup.find(
        'p', style=re.compile(r'text-align: center; line-height: 2.75ex')
    )
    # Explicit check instead of a blanket ``except``: a page without the
    # paragraph is an expected case, anything else should surface.
    if advisor_paragraph is None:
        return {}

    ids_names = {}
    # ``href=True`` skips anchors without a target instead of raising.
    for link in advisor_paragraph.find_all('a', href=True):
        # Pull the numeric id out of the query string; a malformed link is
        # skipped on its own rather than discarding the whole page.
        match = re.search(r'[?&]id=(\d+)', link['href'])
        if match is None:
            continue
        person_id = match.group(1)
        # split()/join collapses double spaces and non-breaking spaces.
        name = " ".join(link.get_text().split())
        print("id = " + str(person_id))
        print("name = " + str(name))
        ids_names[person_id] = name
    return ids_names
# ---------------------------------------------------------------------------
# Crawl: starting from IDCURRENT, follow the links returned by
# parseIdsAndGetIdsName() until no unvisited id is left, then dump the
# collected names and edges as JSON.
# ---------------------------------------------------------------------------

# Seed the work list with the people linked from the start page.
IdsNameStart = parseIdsAndGetIdsName(IDCURRENT)
# Ids parsed from hrefs are strings, so the start id is recorded as a string
# too; otherwise the membership test below could never match it.
ALREADYSCRAPED.append(str(IDCURRENT))
for idscraped in IdsNameStart:
    ALLAUTHORS[idscraped] = IdsNameStart[idscraped]
    REMAININGIDTOSCRAP.append(idscraped)
# NOTE(review): the start record itself is not added to ALLAUTHORS and no
# edge from it to the people above is recorded -- confirm this is intended.

while REMAININGIDTOSCRAP:
    print("len of remaining ids to scrap: " + str(len(REMAININGIDTOSCRAP)))
    popId = REMAININGIDTOSCRAP.pop()
    if popId in ALREADYSCRAPED:
        continue

    IdsNameNew = parseIdsAndGetIdsName(popId)
    ALREADYSCRAPED.append(popId)
    # Pause after every request, including pages that yield no links, so the
    # server is never hit more than about once per second.
    time.sleep(1)

    for idscraped in IdsNameNew:
        ALLAUTHORS[idscraped] = IdsNameNew[idscraped]
        if idscraped not in REMAININGIDTOSCRAP:
            REMAININGIDTOSCRAP.append(idscraped)
        if (popId, idscraped) not in EDGES:
            EDGES.append((popId, idscraped))

# Persist the results; tuples in EDGES are serialized as JSON arrays.
with open("authors.json", "w") as f:
    json.dump(ALLAUTHORS, f)
with open('edges.txt', 'w') as f:
    json.dump(EDGES, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment