Skip to content

Instantly share code, notes, and snippets.

@pnasrat
Created May 22, 2020 15:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pnasrat/824e6a132c3f51b42506e892cb92648a to your computer and use it in GitHub Desktop.
Save pnasrat/824e6a132c3f51b42506e892cb92648a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
from bs4 import BeautifulSoup
import csv
import glob
import re
import json
def parseFile(fn):
    """Return the text of the last <p> inside the first <pre> of an HTML file.

    Args:
        fn: Path of the HTML file to read. The file is decoded as ISO-8859-1.

    Returns:
        The text content of the last <p> element found inside the first
        <pre> element of the document.

    Raises:
        ValueError: If the document has no <pre> element, or that element
            contains no <p> elements.
    """
    with open(fn, encoding="ISO-8859-1") as f:
        soup = BeautifulSoup(f, features="html.parser")

    # find() returns None when nothing matches; fail with a message that
    # names the file instead of an opaque AttributeError further down.
    pre = soup.find("pre")
    if pre is None:
        raise ValueError(f"{fn}: no <pre> element found")

    paragraphs = pre.find_all("p")
    if not paragraphs:
        raise ValueError(f"{fn}: <pre> element contains no <p> elements")

    return paragraphs[-1].text
def toRecords(body, sep="\n \n"):
    """Split the listings text into individual record strings.

    Args:
        body: The full listings text.
        sep: The string that separates consecutive records. Defaults to a
            line containing a single space ("\\n \\n"). Must be non-empty,
            otherwise ``str.split`` raises ValueError.

    Returns:
        A list of record strings, in order of appearance. An input that does
        not contain ``sep`` yields a one-element list holding ``body``.
    """
    return body.split(sep)
# Patterns are compiled once at import time instead of on every call.
_LEAD_RE = re.compile(r"^([A-Z]{4}-\d{3})")
_MAX_RE = re.compile(r"^MAX.*: (\d+)")
_COLUMN_GAP_RE = re.compile(r"\s{2,}")


def parseRecord(rec):
    """Parse one listing record into a dict, or return None if it is not one.

    A record is a block of lines separated by "\\n ". Its first line must
    start with an identifier of four capital letters, a dash and three
    digits (e.g. "ABCD-123"); anything else is rejected.

    Args:
        rec: The raw text of a single record.

    Returns:
        None if the first line does not start with an identifier. Otherwise
        a dict with:
            "id":    first column of the first line.
            "title": second column of the first line ("" if absent).
            "other": list of the remaining lines, each stripped.
            "cross": (only if a "CROSS LISTED:" line exists) the
                     whitespace-separated tokens following the colon.
            "max":   (only if a line matches "MAX...: <digits>") that
                     number as an int; the last matching line wins.
    """
    # str.split always yields at least one element, so fields[0] is safe.
    fields = [line.strip() for line in rec.split("\n ")]
    if not _LEAD_RE.match(fields[0]):
        return None

    # Columns on the first line are separated by runs of two or more
    # whitespace characters, so single spaces inside the title are kept.
    # The line may have fewer than three columns, hence no tuple unpacking.
    columns = _COLUMN_GAP_RE.split(fields[0], maxsplit=2)
    details = fields[1:]

    r = {
        "id": columns[0],
        "title": columns[1] if len(columns) > 1 else "",
        "other": details,
    }

    for f in details:
        if f.startswith("CROSS LISTED:"):
            # partition() tolerates an empty list and extra colons, where
            # unpacking the result of split(": ") would raise.
            r["cross"] = f.partition(":")[2].split()
            continue
        # e.g. "MAX W/CROSS LIST: 91"
        m = _MAX_RE.match(f)
        if m:
            r["max"] = int(m.group(1))
    return r
if __name__ == "__main__":
    # Parse every HTML file in the current directory and print all
    # recognised records as a single JSON array on stdout.
    records = []
    # glob order is filesystem-dependent; sort so the output is reproducible.
    for fn in sorted(glob.glob("*.html")):
        for chunk in toRecords(parseFile(fn)):
            rec = parseRecord(chunk)
            # Reuse the parsed result rather than parsing the chunk twice.
            if rec is not None:
                records.append(rec)
    print(json.dumps(records))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment