pnasrat/parseCourseCatalogs.py

## parseCourseCatalogs.py
#!/usr/bin/env python
# coding: utf-8
from bs4 import BeautifulSoup
import csv
import glob
import re
import json


def parseFile(fn):
    """Return the text of the last <p> found inside the file's <pre> element.

    The file named by *fn* is read as ISO-8859-1 and parsed with
    BeautifulSoup's "html.parser" backend.
    """
    with open(fn, encoding="ISO-8859-1") as handle:
        document = BeautifulSoup(handle, features="html.parser")
        paragraphs = document.find('pre').find_all('p')
        # The listings live in the final paragraph of the <pre> block.
        return paragraphs[-1].text

def toRecords(body):
    """Split *body* into chunks on the newline, space, newline separator."""
    separator = "\n \n"
    chunks = body.split(separator)
    return chunks

# Patterns are compiled once at import time instead of on every call.
_LEAD_RE = re.compile(r"^([A-Z]{4}-\d{3})")  # course id prefix, e.g. "MATH-101"
_MAX_RE = re.compile(r"^MAX.*: (\d+)")  # e.g. "MAX W/CROSS LIST: 91"
_GAP_RE = re.compile(r"\s{2,}")  # runs of 2+ whitespace separate header columns


def parseRecord(rec):
    """Parse one catalog chunk into a course dict.

    *rec* is split into fields on "newline + space"; each field is stripped.
    The first field is the header line and must start with a course id
    (four capital letters, a dash, three digits), otherwise ``None`` is
    returned.

    The returned dict always has:
      "id"    -- first header column
      "title" -- second header column ("" when the header has no second column)
      "other" -- list of all fields after the header
    and optionally:
      "cross" -- whitespace-separated ids from a "CROSS LISTED:" field
      "max"   -- integer taken from a field matching "MAX...: <digits>"
    """
    fields = [line.strip() for line in rec.split("\n ")]
    header, details = fields[0], fields[1:]
    if not _LEAD_RE.match(header):
        return None

    # Columns are separated by 2+ whitespace characters so that single spaces
    # inside the title survive.  Index the result instead of unpacking exactly
    # three values: a header with fewer columns must not raise ValueError.
    parts = _GAP_RE.split(header, maxsplit=2)
    r = {
        "id": parts[0],
        "title": parts[1] if len(parts) > 1 else "",
        "other": details,
    }

    for field in details:
        if field.startswith("CROSS LISTED:"):
            # partition() tolerates an empty list or extra colons, where
            # split(": ") with two-way unpacking would raise.
            _, _, listed = field.partition(":")
            r["cross"] = listed.split()
            continue
        m = _MAX_RE.match(field)
        if m:
            r["max"] = int(m.group(1))

    return r

if __name__ == "__main__":
    # Parse every *.html catalog in the current directory and print all
    # course records as one JSON array on stdout.
    records = []
    for fn in glob.glob("*.html"):
        for chunk in toRecords(parseFile(fn)):
            rec = parseRecord(chunk)
            # parseRecord returns None for chunks that are not course entries.
            if rec is not None:
                # Append the record already parsed; do not parse the chunk twice.
                records.append(rec)
    print(json.dumps(records))
	#!/usr/bin/env python
	# coding: utf-8
	from bs4 import BeautifulSoup
	import csv
	import glob
	import re
	import json


	def parseFile(fn):
	    """Return the text of the last <p> found inside the file's <pre> element.

	    The file named by *fn* is read as ISO-8859-1 and parsed with
	    BeautifulSoup's "html.parser" backend.
	    """
	    with open(fn, encoding="ISO-8859-1") as f:
	        soup = BeautifulSoup(f, features="html.parser")
	        pre = soup.find('pre')
	        # The listings live in the final paragraph of the <pre> block.
	        listings = pre.find_all('p')[-1].text
	        return listings

	def toRecords(body):
	    """Split *body* into chunks on the newline, space, newline separator."""
	    return body.split("\n \n")

	# Patterns are compiled once at import time instead of on every call.
	_LEAD_RE = re.compile(r"^([A-Z]{4}-\d{3})")  # course id prefix, e.g. "MATH-101"
	_MAX_RE = re.compile(r"^MAX.*: (\d+)")  # e.g. "MAX W/CROSS LIST: 91"
	_GAP_RE = re.compile(r"\s{2,}")  # runs of 2+ whitespace separate header columns


	def parseRecord(rec):
	    """Parse one catalog chunk into a course dict.

	    *rec* is split into fields on "newline + space"; each field is stripped.
	    The first field is the header line and must start with a course id
	    (four capital letters, a dash, three digits), otherwise ``None`` is
	    returned.

	    The returned dict always has:
	      "id"    -- first header column
	      "title" -- second header column ("" when the header has no second column)
	      "other" -- list of all fields after the header
	    and optionally:
	      "cross" -- whitespace-separated ids from a "CROSS LISTED:" field
	      "max"   -- integer taken from a field matching "MAX...: <digits>"
	    """
	    fields = [line.strip() for line in rec.split("\n ")]
	    header, details = fields[0], fields[1:]
	    if not _LEAD_RE.match(header):
	        return None

	    # Columns are separated by 2+ whitespace characters so that single spaces
	    # inside the title survive.  Index the result instead of unpacking exactly
	    # three values: a header with fewer columns must not raise ValueError.
	    parts = _GAP_RE.split(header, maxsplit=2)
	    r = {
	        "id": parts[0],
	        "title": parts[1] if len(parts) > 1 else "",
	        "other": details,
	    }

	    for field in details:
	        if field.startswith("CROSS LISTED:"):
	            # partition() tolerates an empty list or extra colons, where
	            # split(": ") with two-way unpacking would raise.
	            _, _, listed = field.partition(":")
	            r["cross"] = listed.split()
	            continue
	        m = _MAX_RE.match(field)
	        if m:
	            r["max"] = int(m.group(1))

	    return r

	if __name__ == "__main__":
	    # Parse every *.html catalog in the current directory and print all
	    # course records as one JSON array on stdout.
	    records = []
	    for fn in glob.glob("*.html"):
	        for chunk in toRecords(parseFile(fn)):
	            rec = parseRecord(chunk)
	            # parseRecord returns None for chunks that are not course entries.
	            if rec is not None:
	                # Append the record already parsed; do not parse the chunk twice.
	                records.append(rec)
	    print(json.dumps(records))