Last active
February 8, 2018 00:47
-
-
Save jsundram/913c5749c10b241e9cce1a74bde47847 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from bs4 import BeautifulSoup | |
| import csv | |
| import json | |
| import re | |
| import random | |
| import requests | |
| import time | |
| import urllib | |
| BASE = 'http://web.mit.edu/bin/cgicso?query' | |
def get_names(filename):
    """Read the 'Supervisor' column from a CSV file.

    Args:
        filename: Path to a CSV file whose first row is a header that
            contains a 'Supervisor' column.

    Returns:
        A list holding the 'Supervisor' value of every data row, in file
        order. Duplicates and empty strings are kept. An empty file
        yields an empty list.

    Raises:
        ValueError: If the header row has no 'Supervisor' column.
    """
    with open(filename) as f:
        reader = csv.reader(f)
        # The builtin next() works on Python 2 and 3 alike (reader.next()
        # is Python 2 only); the default guards against an empty file.
        header = next(reader, None)
        if header is None:
            return []
        name_ix = header.index('Supervisor')
        # Blank lines come back as [] from csv.reader; skip any row that is
        # too short to have a Supervisor cell instead of raising IndexError.
        return [row[name_ix] for row in reader if len(row) > name_ix]
def fix_name(name):
    """Convert a "Last /First Middle" name into a "First Last" query string.

    The query interface takes "First Last", so this function:
      1) reorders the two halves found around the ' /' separator,
      2) removes single-letter middle initials (with or without a period),
      3) removes a leading first initial (e.g. "J. Robert" -> "Robert").
    Full middle names are left in place.

    Args:
        name: A name formatted as "Last /First [Middle]".

    Returns:
        The reordered name with initials removed.

    Raises:
        ValueError: If `name` does not contain exactly one ' /' separator.
    """
    parts = name.split(' /')
    if len(parts) != 2:
        raise ValueError('expected "Last /First Middle", got %r' % (name,))
    last, first_middle = parts
    ordered = first_middle + ' ' + last
    # The lookahead leaves the following whitespace unconsumed, so runs of
    # consecutive initials ("John A. B. Smith") are all removed in one pass.
    no_middle = re.sub(r'\s[A-Z]\.?(?=\s)', '', ordered)
    # Get rid of a first initial too.
    return re.sub(r'^[A-Z]\.?\s', '', no_middle)
def parse(contents):
    """Extract "field: value" pairs from a directory result page.

    The text of the last child node of the first <pre> element is split
    into lines, and each non-blank line is split on its first ': ' into a
    field name and a value.

    Args:
        contents: HTML of the result page, as accepted by BeautifulSoup.

    Returns:
        A dict mapping each field name to its value (both strings).

    Raises:
        IndexError: If the page has no <pre> element.
        ValueError: If a non-blank line contains no ': ' separator.
    """
    soup = BeautifulSoup(contents, 'html.parser')
    pre = soup.find_all('pre')[0]
    data = {}
    for raw_line in pre.contents[-1].split('\n'):
        # Strip before testing for emptiness so whitespace-only lines are
        # skipped rather than failing to unpack.
        line = raw_line.strip()
        if not line:
            continue
        # Split on the first ': ' only, so values that themselves contain
        # ': ' are kept whole instead of raising.
        fieldname, sep, value = line.partition(': ')
        if not sep:
            raise ValueError('no ": " separator in line %r' % (line,))
        data[fieldname] = value
    return data
def get_pages(names, timeout=30):
    """Query the directory service for each name and collect the records.

    For every entry in `names` the name is normalised with fix_name(), the
    result page is fetched, the raw response is saved to
    "<url-encoded name>.txt" in the current directory, and the page is
    parsed with parse(). When parsing fails, a record holding only the
    name is stored instead. A random 1-6 second pause follows each request.

    Args:
        names: Iterable of names in "Last /First Middle" form.
        timeout: Seconds to wait for the server before giving up on a
            request, so a stalled connection cannot hang the run forever.

    Returns:
        A list of dicts, one per name, each with at least a 'name' key.

    Raises:
        Exception: If parsing fails and the response is shorter than 500
            bytes, which is taken as a sign of rate limiting.
        requests.exceptions.RequestException: If a request fails or times
            out.
    """
    # urlencode lives in urllib on Python 2 and in urllib.parse on Python 3.
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode

    data = []
    err_insufficient_data = 500
    for n in names:
        name = fix_name(n)
        # BASE already ends in '?query'; encoding an empty key produces
        # '=First+Last', which completes the query string.
        query = urlencode({'': name})
        url = BASE + query
        print(url)
        r = requests.get(url, timeout=timeout)
        # r.content is raw bytes, so the dump file is opened in binary mode.
        with open(query[1:] + '.txt', 'wb') as f:
            f.write(r.content)
        try:
            record = parse(r.content)
            record['name'] = name
        except Exception as e:
            print("problem with %s: %s" % (name, e))
            # Possibilities:
            # 1: No Results
            #    A: (not current faculty)
            #       examples: Scott Aaronson
            #    B: Wrong form of name (middle name included)
            #       examples: Keith Adam Nelson
            # 2: multiple records found?
            # 3: Rate limited?
            if len(r.content) < err_insufficient_data:
                raise Exception("Insuficient data: were we rate-limited?")
            record = {'name': name}
        data.append(record)
        # sleep to avoid undue traffic on search service
        # TODO: may need to sleep longer to avoid getting IP banned?
        time.sleep(1 + random.random() * 5)
    return data
def main():
    """Look up every supervisor in research.csv and save the results.

    Reads names from 'research.csv', drops empty and duplicate entries,
    queries the directory for each one, and writes the collected records
    to 'data.json'. Pressing Ctrl-C stops the lookups early; whatever was
    collected up to that point is still saved.
    """
    names = get_names('research.csv')
    # Get rid of '' and deduplicate; sorting makes the query order
    # reproducible from run to run.
    names = sorted(n for n in set(names) if n)
    data = []
    try:
        # Fetch one name per call so that an interrupt keeps the records
        # gathered so far, instead of leaving `data` unbound.
        for name in names:
            data.extend(get_pages([name]))
    except KeyboardInterrupt:
        print("interrupted after %d of %d names" % (len(data), len(names)))
    print("saving results to data.json")
    with open('data.json', 'w') as f:
        json.dump(data, f, indent=4)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment