Skip to content

Instantly share code, notes, and snippets.

@mistertwo
Created February 2, 2021 16:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mistertwo/e25c4fec193a111d4b2f5be283573e1a to your computer and use it in GitHub Desktop.
Save mistertwo/e25c4fec193a111d4b2f5be283573e1a to your computer and use it in GitHub Desktop.
Class Plan Scrape and Compare: a quick script that compares two degree-plan pages and lists the classes that appear on both.
# https://catalog.utexas.edu/undergraduate/engineering/degrees-and-programs/bs-electrical-engineering/
# and
# https://catalog.utexas.edu/undergraduate/natural-sciences/degrees-and-programs/bs-mathematics/
# How to do a test run in IPython:
# import edu_scraper as eds; eds.page0 = "https://catalog.utexas.edu/undergraduate/engineering/degrees-and-programs/bs-electrical-engineering/";
# eds.page1 = "https://catalog.utexas.edu/undergraduate/natural-sciences/degrees-and-programs/bs-mathematics/"; eds.set_target_class_number_struct("([0-9]{3}[A-Z])"); results = eds.do_stuff()
# Module-level configuration, populated via the setter functions below.
page0 = "" # URL of the first degree-plan page to compare
page1 = "" # URL of the second degree-plan page to compare
class_term_number_struct = "" # course-number regex, ie ([0-9]{3}[A-Z]) for 123X
class_subject_names = [] # where to stash possible class prefixes like Mathematics (currently unused in this chunk)
# script information setters
# TODO: set up objects for raw HTML to handle target sections of a page
def set_target_class_number_struct(re_string):
    """Record the regex used to recognize course numbers.

    re_string should contain one capture group, e.g. "([0-9]{3}[A-Z])"
    to match numbers shaped like 123X.
    """
    # TODO: insert re string formatting check?
    global class_term_number_struct
    class_term_number_struct = re_string
def set_page_URLs(target_page0, target_page1):
    """Store the two catalog page URLs that do_stuff() will compare.

    Bug fix: the original assignments created function locals and left the
    module-level page0/page1 untouched, so this setter silently did nothing.
    Declaring them global makes the setter actually work.
    """
    # TODO: filter URLs for good pages?
    global page0, page1
    page0 = target_page0
    page1 = target_page1
# these functions actually do things
def getURL(url_string):
    """Fetch url_string and return the response body text, or None on a
    non-200 status code.

    TODO: properly handle errors here...
      ConnectionError
      MaxRetryError
    """
    pagedata = requests.get(url_string)
    if pagedata.status_code != 200:
        # Bug fix: `debug` is never defined anywhere in this module, so the
        # original `if debug == 1` raised NameError on any failed fetch.
        # Look it up defensively so a bad response returns None as intended.
        if globals().get("debug", 0) == 1:
            print("got a non-200 response code for URL: " + url_string[:20] + "...")
        return None
    return pagedata.text
def parse_class_info(classdata, pattern=None):
    """Extract the unique course numbers found in raw page text.

    classdata: raw HTML/text of a catalog page.
    pattern:   regex containing one capture group for the course number,
               e.g. "([0-9]{3}[A-Z])". Defaults to the module-level
               class_term_number_struct set via set_target_class_number_struct().

    Returns a sorted list of the unique captured course numbers.
    """
    if pattern is None:
        pattern = class_term_number_struct
    # Up to 20 chars of context is consumed on either side of each hit, which
    # keeps overlapping occurrences from being double counted; findall still
    # returns only the captured group (the bare course number).
    matches = re.findall(r".{0,20}" + pattern + r".{0,20}", classdata)
    # The original deduped via list(set()) twice with a pointless numpy
    # object-array round trip in between; a single sorted(set()) dedups and
    # makes the output order deterministic.
    return sorted(set(matches))
def crossover_check(class1, class2):
    """Return the entries of class1 that also appear in class2.

    Order (and any duplicates) of class1 are preserved. Membership is
    tested against a set, making this O(len(class1) + len(class2)) instead
    of the original O(n*m) nested list scan.
    """
    lookup = set(class2)
    return [x for x in class1 if x in lookup]
def filter_entries(class_data):
    """Drop entries whose text still contains raw-HTML markers.

    Only the "td" tag fragment is screened out for now; add more strings
    to filter_targets to screen additional markup.
    """
    # TODO: filter class matches for stuff that has raw HTML in its preceding text
    filter_targets = ("td",)
    return [entry for entry in class_data
            if not any(marker in entry for marker in filter_targets)]
def refilter_pages(page_data, target_list):
    """Re-scan page_data for each matched course number with extra leading
    context, so the caller can see what subject name precedes the number.

    page_data:   raw page text.
    target_list: course-number strings (e.g. ["408C"]) to look up again.

    Returns the unique "context + number" strings found, minus any that
    contain "this" (a crude screen for boilerplate sentences).
    """
    # TODO: fix below. regex could be better.
    # Build one alternation: "(.{15,20} 408C|.{15,20} 427L|...)".
    # The original wrapped this group in a second, redundant set of parens,
    # which made findall return 2-tuples that then needed np.hstack to
    # flatten; a single capture group yields plain strings directly.
    target_string = "(" + "|".join(".{15,20} " + x for x in target_list) + ")"
    search_results = re.findall(r".{0,20}" + target_string, page_data)
    dedupe_results = set(search_results)
    return [x for x in dedupe_results if "this" not in x]
def do_stuff():
    """Fetch both configured pages, find the course numbers they share, and
    return those numbers with their leading context taken from page0."""
    raw0 = getURL(page0)
    raw1 = getURL(page1)
    shared = crossover_check(parse_class_info(raw0), parse_class_info(raw1))
    return refilter_pages(raw0, shared)
#def get_class_prefixes(page_text, class_list):
#class_string = "".join([("|" + x) for x in class_list])
#class_string = class_string.lstrip("|")
#results = re.findall(r".{0,20}(" + class_string + ")")
#breakpoint()
#return results
# TODO: set up limit for running time
# TODO: set up HTML output and server on 8080, cuz terminal is nice, but not for everyone...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment