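"""Look up the names in research.csv in the MIT people directory (cgicso),
saving each raw response to a local .txt file and all parsed records to
data.json.

Requires the third-party packages requests and beautifulsoup4.
Source gist: https://gist.github.com/jsundram/913c5749c10b241e9cce1a74bde47847
"""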
import csv
import json
import random
import re
import time
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

BASE = 'http://web.mit.edu/bin/cgicso?query'

def get_names(filename):
    """Return the 'Supervisor' column from the given CSV file."""
    names = []
    with open(filename) as f:
        r = csv.reader(f)
        header = next(r)
        name_ix = header.index('Supervisor')
        for row in r:
            names.append(row[name_ix])
    return names
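
# research.csv is not included in this gist; it is assumed to have a header
# row with a 'Supervisor' column holding names in "Last /First Middle" form,
# e.g. something like:
#
#   Title,Supervisor
#   Quantum Complexity,Aaronson /Scott
#   Ultrafast Spectroscopy,Nelson /Keith A.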

def fix_name(name):
    """Names are given as "Last /First Middle"; the query interface
    takes "First Last", so:
    1) reorder the parts
    2) remove initials (if present)
    """
    last, firstm = name.split(' /')
    ordered = firstm + ' ' + last
    no_middle = re.sub(r'\s[A-Z]\.?\s', ' ', ordered)
    # Get rid of a leading first initial too.
    return re.sub(r'^[A-Z]\.?\s', '', no_middle)
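
# For example, with hypothetical inputs:
#   fix_name('Nelson /Keith A.')       -> 'Keith Nelson'
#   fix_name('Oppenheimer /J. Robert') -> 'Robert Oppenheimer'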

def parse(contents):
    """Extract the "field: value" lines from a directory result page."""
    soup = BeautifulSoup(contents, 'html.parser')
    pre = soup.find('pre')
    info = pre.contents[-1].split('\n')
    data = {}
    for line in info:
        if line:
            # Split on the first ': ' only, in case a value contains one.
            fieldname, value = line.strip().split(': ', 1)
            data[fieldname] = value
    return data
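
# The directory is assumed to wrap each record in a <pre> block of
# "field: value" lines; an illustrative (not real) record such as
#
#   name: Nelson, Keith A.
#   department: Chemistry
#
# would parse to {'name': 'Nelson, Keith A.', 'department': 'Chemistry'}.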

def get_pages(names):
    data = []
    err_insufficient_data = 500  # responses shorter than this look like error pages
    try:
        for n in names:
            name = fix_name(n)
            query = urlencode({'': name})  # yields '=First+Last'
            url = BASE + query
            print(url)
            r = requests.get(url)
            with open(query[1:] + '.txt', 'w') as f:
                f.write(r.text)
            try:
                record = parse(r.text)
                record['name'] = name
            except Exception as e:
                print("problem with %s: %s" % (name, e))
                # Possibilities:
                # 1: No results
                #    A: not current faculty (example: Scott Aaronson)
                #    B: wrong form of name, middle name included
                #       (example: Keith Adam Nelson)
                # 2: Multiple records found?
                # 3: Rate-limited?
                if len(r.text) < err_insufficient_data:
                    raise Exception("Insufficient data: were we rate-limited?")
                record = {'name': name}
            data.append(record)
            # Sleep to avoid undue traffic on the search service.
            # TODO: may need to sleep longer to avoid getting IP-banned?
            time.sleep(1 + random.random() * 5)
    except KeyboardInterrupt:
        pass  # Ctrl-C stops the crawl; return whatever was collected so far
    return data

def main():
    names = get_names('research.csv')
    names = list(filter(None, set(names)))  # get rid of '', deduplicate
    data = get_pages(names)  # returns partial results if interrupted with Ctrl-C
    print("saving results to data.json")
    with open('data.json', 'w') as f:
        json.dump(data, f, indent=4)


if __name__ == '__main__':
    main()
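
# To run (assuming the script is saved as, say, scrape.py alongside research.csv):
#   python3 scrape.py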