Created
February 2, 2021 16:03
-
-
Save mistertwo/e25c4fec193a111d4b2f5be283573e1a to your computer and use it in GitHub Desktop.
Class Plan Scrape and Compare. Something thrown together to compare a couple pages of degree plans, and see what classes might match.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://catalog.utexas.edu/undergraduate/engineering/degrees-and-programs/bs-electrical-engineering/
# and
# https://catalog.utexas.edu/undergraduate/natural-sciences/degrees-and-programs/bs-mathematics/
# How to test run in ipython:
# import edu_scraper as eds; eds.page0 = "https://catalog.utexas.edu/undergraduate/engineering/degrees-and-programs/bs-electrical-engineering/";
# eds.page1 = "https://catalog.utexas.edu/undergraduate/natural-sciences/degrees-and-programs/bs-mathematics/"; eds.set_target_class_number_struct("([0-9]{3}[A-Z])"); results = eds.do_stuff()
# Module-level configuration. Set these (directly or through the setter
# functions) before calling do_stuff().
page0 = ""  # URL of the first page handed to getURL()
page1 = ""  # URL of the second page handed to getURL()
class_term_number_struct = ""  # ie ([0-9]{3}[A-Z]) for 123X
class_subject_names = []  # where to stash possible class prefixes like Mathematics
debug = 0  # set to 1 to make getURL() print a note on non-200 responses
# script information setters | |
# TODO: set up objects for raw HTML to handle target sections of a page | |
def set_target_class_number_struct(re_string):
    """Store ``re_string`` as the module-wide class-number regex.

    The value is written to the module global ``class_term_number_struct``
    exactly as given; no validation is performed.
    """
    # TODO: insert re string formatting check?
    globals()["class_term_number_struct"] = re_string
def set_page_URLs(target_page0, target_page1):
    """Set the two page URLs that do_stuff() will fetch and compare.

    Args:
        target_page0: URL stored in the module global ``page0``.
        target_page1: URL stored in the module global ``page1``.
    """
    # TODO: filter URLs for good pages?
    # Without this declaration the assignments below would only create
    # function-local names and the module-level URLs would never change.
    global page0, page1
    page0 = target_page0
    page1 = target_page1
# these functions actually do things | |
def getURL(url_string, timeout=10):
    """Fetch ``url_string`` and return the response body as text.

    Args:
        url_string: URL requested with ``requests.get``.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            server would block the script indefinitely.

    Returns:
        The response text, or ``None`` when the status code is not 200.

    Raises:
        requests.exceptions.RequestException: connection problems and
            timeouts are not handled here and propagate to the caller.
    """
    pagedata = requests.get(url_string, timeout=timeout)
    # TODO: properly handle errors here...
    #       ConnectionError
    #       MaxRetryError
    if pagedata.status_code != 200:
        # ``debug`` is an optional module-level flag; look it up defensively
        # so a missing flag cannot turn a bad response into a NameError.
        if globals().get("debug", 0) == 1:
            print("got a non-200 response code for URL: " + url_string[:20] + "...")
        return None
    return pagedata.text
def parse_class_info(classdata):
    """Return the unique matches of the class-number regex in ``classdata``.

    The regex is read from the module global ``class_term_number_struct``.

    * If the regex contains a capture group (e.g. ``([0-9]{3}[A-Z])``), the
      captured values are returned. The pattern is applied as-is: wrapping it
      in ``.{0,20}`` context would not change what is captured, but it would
      consume the text around each hit and silently skip class numbers that
      sit within about 20 characters of another one.
    * If the regex has no capture group, each match is returned together with
      up to 20 characters of context on either side.

    Args:
        classdata: Page text to search. ``None`` (what getURL() returns for a
            failed request) or an empty string yields an empty list.

    Returns:
        A sorted list of the distinct matches.
    """
    if not classdata:
        return []
    pattern = re.compile(class_term_number_struct)
    if pattern.groups == 0:
        # No capture group: the caller wants the surrounding text as well.
        pattern = re.compile(r".{0,20}" + class_term_number_struct + r".{0,20}")
    return sorted(set(pattern.findall(classdata)))
def crossover_check(class1, class2):
    """Return the entries of ``class1`` that are also found in ``class2``.

    The result keeps the order (and any duplicates) of ``class1``.
    """
    return list(filter(lambda entry: entry in class2, class1))
def filter_entries(class_data):
    """Return the entries of ``class_data`` that contain no unwanted substring.

    The only unwanted substring at the moment is ``"td"`` (presumably meant to
    weed out matches whose text includes HTML table-cell markup).
    """
    filter_targets = ["td"]
    # TODO: filter class matches for stuff that has raw HTML in its preceding text
    return [
        entry
        for entry in class_data
        if not any(term in entry for term in filter_targets)
    ]
def refilter_pages(page_data, target_list):
    """Find each target in ``page_data`` together with its preceding text.

    For every entry of ``target_list`` the page is searched for 15-20
    characters, a space, and then the entry itself; the distinct snippets are
    returned, minus any snippet containing the word ``"this"``.

    Args:
        page_data: Page text to search. ``None`` or an empty string yields an
            empty list.
        target_list: Literal strings to look for. They are regex-escaped, so
            characters such as ``+`` or ``(`` are matched verbatim.

    Returns:
        A sorted list of the distinct snippets; empty when there are no
        targets or nothing matches.
    """
    targets = list(target_list)
    # An empty alternation would match the empty string at every position,
    # so bail out early instead of reporting a bogus '' result.
    if not page_data or not targets:
        return []
    # TODO: fix below. regex could be better.
    alternatives = "|".join(".{15,20} " + re.escape(target) for target in targets)
    pattern = re.compile(r".{0,20}(" + alternatives + ")")
    # A set handles de-duplication directly and copes with zero matches.
    snippets = set(pattern.findall(page_data))
    return sorted(snippet for snippet in snippets if "this" not in snippet)
def do_stuff():
    """Fetch both configured pages and report the class numbers they share.

    Reads the module globals ``page0`` and ``page1``, extracts class numbers
    from each page with parse_class_info(), intersects them with
    crossover_check(), and looks the shared ones up again in the first page
    with refilter_pages().

    Returns:
        The list produced by refilter_pages() for the first page, or an empty
        list when either page could not be fetched or the pages share no
        class numbers.
    """
    page0_data = getURL(page0)
    page1_data = getURL(page1)
    # getURL() returns None for a non-200 response; there is nothing to
    # compare in that case.
    if page0_data is None or page1_data is None:
        return []
    class_info_page0 = parse_class_info(page0_data)
    class_info_page1 = parse_class_info(page1_data)
    crossover = crossover_check(class_info_page0, class_info_page1)
    if not crossover:
        # No shared classes, so there is nothing to look up in the page.
        return []
    return refilter_pages(page0_data, crossover)
#def get_class_prefixes(page_text, class_list): | |
#class_string = "".join([("|" + x) for x in class_list]) | |
#class_string = class_string.lstrip("|") | |
#results = re.findall(r".{0,20}(" + class_string + ")") | |
#breakpoint() | |
#return results | |
# TODO: set up limit for running time | |
# TODO: set up HTML output and server on 8080, cuz terminal is nice, but not for everyone... | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment