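"""Look up the names in research.csv in the MIT people directory (cgicso),
saving each raw response to a local .txt file and all parsed records to
data.json.

Requires the third-party packages requests and beautifulsoup4.
Source gist: https://gist.github.com/jsundram/913c5749c10b241e9cce1a74bde47847
"""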
import csv
import json
import random
import re
import time
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

BASE = 'http://web.mit.edu/bin/cgicso?query'

def get_names(filename):
    """Return the 'Supervisor' column from the given CSV file."""
    names = []
    with open(filename) as f:
        r = csv.reader(f)
        header = next(r)
        name_ix = header.index('Supervisor')
        for row in r:
            names.append(row[name_ix])
    return names
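
# research.csv is not included in this gist; it is assumed to have a header
# row with a 'Supervisor' column holding names in "Last /First Middle" form,
# e.g. something like:
#
#   Title,Supervisor
#   Quantum Complexity,Aaronson /Scott
#   Ultrafast Spectroscopy,Nelson /Keith A.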

def fix_name(name):
    """Names are given as "Last /First Middle"; the query interface
    takes "First Last", so:
    1) reorder the parts
    2) remove initials (if present)
    """
    last, firstm = name.split(' /')
    ordered = firstm + ' ' + last
    no_middle = re.sub(r'\s[A-Z]\.?\s', ' ', ordered)
    # Get rid of a leading first initial too.
    return re.sub(r'^[A-Z]\.?\s', '', no_middle)
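
# For example, with hypothetical inputs:
#   fix_name('Nelson /Keith A.')       -> 'Keith Nelson'
#   fix_name('Oppenheimer /J. Robert') -> 'Robert Oppenheimer'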

def parse(contents):
    """Extract the "field: value" lines from a directory result page."""
    soup = BeautifulSoup(contents, 'html.parser')
    pre = soup.find('pre')
    info = pre.contents[-1].split('\n')
    data = {}
    for line in info:
        if line:
            # Split on the first ': ' only, in case a value contains one.
            fieldname, value = line.strip().split(': ', 1)
            data[fieldname] = value
    return data
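
# The directory is assumed to wrap each record in a <pre> block of
# "field: value" lines; an illustrative (not real) record such as
#
#   name: Nelson, Keith A.
#   department: Chemistry
#
# would parse to {'name': 'Nelson, Keith A.', 'department': 'Chemistry'}.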

def get_pages(names):
    data = []
    err_insufficient_data = 500  # responses shorter than this look like error pages
    try:
        for n in names:
            name = fix_name(n)
            query = urlencode({'': name})  # yields '=First+Last'
            url = BASE + query
            print(url)
            r = requests.get(url)
            with open(query[1:] + '.txt', 'w') as f:
                f.write(r.text)
            try:
                record = parse(r.text)
                record['name'] = name
            except Exception as e:
                print("problem with %s: %s" % (name, e))
                # Possibilities:
                # 1: No results
                #    A: not current faculty (example: Scott Aaronson)
                #    B: wrong form of name, middle name included
                #       (example: Keith Adam Nelson)
                # 2: Multiple records found?
                # 3: Rate-limited?
                if len(r.text) < err_insufficient_data:
                    raise Exception("Insufficient data: were we rate-limited?")
                record = {'name': name}
            data.append(record)
            # Sleep to avoid undue traffic on the search service.
            # TODO: may need to sleep longer to avoid getting IP-banned?
            time.sleep(1 + random.random() * 5)
    except KeyboardInterrupt:
        pass  # Ctrl-C stops the crawl; return whatever was collected so far
    return data

def main():
    names = get_names('research.csv')
    names = list(filter(None, set(names)))  # get rid of '', deduplicate
    data = get_pages(names)  # returns partial results if interrupted with Ctrl-C
    print("saving results to data.json")
    with open('data.json', 'w') as f:
        json.dump(data, f, indent=4)


if __name__ == '__main__':
    main()
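
# To run (assuming the script is saved as, say, scrape.py alongside research.csv):
#   python3 scrape.py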