# mistertwo/edu_scraper.py

## edu_scraper.py
# https://catalog.utexas.edu/undergraduate/engineering/degrees-and-programs/bs-electrical-engineering/
# and
# https://catalog.utexas.edu/undergraduate/natural-sciences/degrees-and-programs/bs-mathematics/

#how to test run in ipython:
# import edu_scraper as eds; eds.page0 = "https://catalog.utexas.edu/undergraduate/engineering/degrees-and-programs/bs-electrical-engineering/";
# eds.page1 = "https://catalog.utexas.edu/undergraduate/natural-sciences/degrees-and-programs/bs-mathematics/"; eds.set_target_class_number_struct("([0-9]{3}[A-Z])"); results = eds.do_stuff()

# Module-level configuration read by the functions below.
page0 = ""  # URL of the first page; fetched by do_stuff()
page1 = ""  # URL of the second page; fetched by do_stuff()
class_term_number_struct = "" # ie ([0-9]{3}[A-Z]) for 123X
class_subject_names = [] # where to stash possible class prefixes like Mathematics
debug = 0  # getURL() prints a note about non-200 responses when this is 1

# script information setters
# TODO: set up objects for raw HTML to handle target sections of a page
def set_target_class_number_struct(re_string):
    """Store *re_string* as the module-wide class-number regex.

    The value is written to the module global ``class_term_number_struct``,
    which ``parse_class_info`` splices into its search pattern.
    """
    # TODO: insert re string formatting check?
    globals()["class_term_number_struct"] = re_string

def set_page_URLs(target_page0, target_page1):
    """Set the two page URLs that ``do_stuff`` downloads and compares.

    Args:
        target_page0: URL stored in the module global ``page0``.
        target_page1: URL stored in the module global ``page1``.
    """
    # Without this declaration the assignments below only bind locals and
    # the module-level URLs never change.
    global page0, page1
    # TODO: filter URLs for good pages?
    page0 = target_page0
    page1 = target_page1


# these functions actually do things
def getURL(url_string, timeout=30):
    """Fetch *url_string* with HTTP GET and return the response body.

    Args:
        url_string: URL to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text, or ``None`` when the connection fails, the
        request times out, or the server answers with a status other
        than 200.
    """
    # Imported here because the module has no top-level import of requests.
    import requests

    # The debug flag is optional: a missing global means "off" rather than
    # a NameError on the error path.
    verbose = globals().get("debug", 0) == 1

    try:
        pagedata = requests.get(url_string, timeout=timeout)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.Timeout) as err:
        # Network-level failures (including exhausted retries) are reported
        # the same way as a bad status code; malformed URLs still raise.
        if verbose:
            print("request failed for URL: " + url_string[:20] + "... (" + str(err) + ")")
        return None

    if pagedata.status_code != 200:
        if verbose:
            print("got a non-200 response code for URL: " + url_string[:20] + "...")
        return None
    return pagedata.text

def parse_class_info(classdata):
    """Find every class-number match in *classdata*.

    The search pattern is the module global ``class_term_number_struct``
    wrapped in up to 20 characters of leading and trailing context.  As with
    ``re.findall``, a pattern containing one capture group yields only the
    captured text, while a pattern without groups yields the whole match
    including its context.

    Args:
        classdata: Page text to search, or ``None`` (what ``getURL`` returns
            for a failed download).

    Returns:
        A list of the distinct matches in first-seen order; empty when
        *classdata* is ``None`` or nothing matches.
    """
    import re  # local import: the module has no top-level import of re

    if classdata is None:
        return []

    # Example patterns this was developed against:
    #   re.findall(r"Mathematics ([0-9]{3}[A-Z])", classdata)
    #   re.findall(r".{0,20}427L.{0,20}", classdata)
    matches = re.findall(r".{0,20}" + class_term_number_struct + ".{0,20}", classdata)
    # dict.fromkeys de-duplicates while keeping page order, and also works
    # for the tuples findall returns when the pattern has several groups.
    return list(dict.fromkeys(matches))

def crossover_check(class1, class2):
    """Return the entries of *class1* that also occur in *class2*.

    Order and duplicates follow *class1*; membership is tested with ``in``
    against *class2*.
    """
    shared = []
    for entry in class1:
        if entry in class2:
            shared.append(entry)
    return shared

def filter_entries(class_data):
    """Return the entries of *class_data* that contain no filter term.

    The only filter term at the moment is ``"td"`` (presumably to discard
    matches that picked up HTML table-cell markup -- confirm).
    """
    filter_targets = ["td"]
    kept = []
    for entry in class_data:
        # Skip the entry as soon as any unwanted term shows up in it.
        if any(term in entry for term in filter_targets):
            continue
        kept.append(entry)
    # TODO: filter class matches for stuff that has raw HTML in its preceding text
    return kept

def refilter_pages(page_data, target_list):
    """Search *page_data* for each target together with its preceding text.

    Every target is looked for as ``<15-20 characters><space><target>``, so
    each returned snippet carries the text that sits directly in front of
    the target on the page.

    Args:
        page_data: Page text to search, or ``None``.
        target_list: Literal strings to look for (``do_stuff`` passes the
            output of ``crossover_check``).

    Returns:
        The distinct snippets in first-seen order, excluding any snippet
        that contains ``"this"``.  Empty when there is nothing to search,
        nothing to search for, or nothing matches.
    """
    import re  # local import: the module has no top-level import of re

    # An empty alternation would match the empty string everywhere, and a
    # missing page cannot be searched, so bail out early in both cases.
    if not page_data or not target_list:
        return []

    # TODO: fix below. regex could be better.
    # Targets are literal page text, so escape any regex metacharacters.
    alternatives = "|".join(".{15,20} " + re.escape(x) for x in target_list)
    # A single capture group makes findall return plain strings.
    search_results = re.findall(r".{0,20}(" + alternatives + ")", page_data)
    dedupe_results = list(dict.fromkeys(search_results))
    return [x for x in dedupe_results if "this" not in x]

def do_stuff():
    """Report the class matches that appear on both configured pages.

    Downloads ``page0`` and ``page1``, extracts the class matches from each
    with ``parse_class_info``, keeps the ones found on both pages, and then
    re-searches the first page for those matches together with their
    preceding text.

    Returns:
        The list produced by ``refilter_pages`` for the first page, or an
        empty list when either page could not be downloaded or the pages
        have no matches in common.
    """
    page0_data = getURL(page0)
    page1_data = getURL(page1)
    # getURL signals failure with None; the regex helpers need real text.
    if page0_data is None or page1_data is None:
        return []

    class_info_page0 = parse_class_info(page0_data)
    class_info_page1 = parse_class_info(page1_data)
    crossover = crossover_check(class_info_page0, class_info_page1)
    # With nothing in common there is nothing to look up again.
    if not crossover:
        return []
    return refilter_pages(page0_data, crossover)

#def get_class_prefixes(page_text, class_list):
    #class_string = "".join([("|" + x) for x in class_list])
    #class_string = class_string.lstrip("|")
    #results = re.findall(r".{0,20}(" + class_string + ")")

    #breakpoint()
    #return results

# TODO: set up limit for running time
# TODO: set up HTML output and server on 8080, cuz terminal is nice, but not for everyone...
	# https://catalog.utexas.edu/undergraduate/engineering/degrees-and-programs/bs-electrical-engineering/
	# and
	# https://catalog.utexas.edu/undergraduate/natural-sciences/degrees-and-programs/bs-mathematics/

	#how to test run in ipython:
	# import edu_scraper as eds; eds.page0 = "https://catalog.utexas.edu/undergraduate/engineering/degrees-and-programs/bs-electrical-engineering/";
	# eds.page1 = "https://catalog.utexas.edu/undergraduate/natural-sciences/degrees-and-programs/bs-mathematics/"; eds.set_target_class_number_struct("([0-9]{3}[A-Z])"); results = eds.do_stuff()

	# Module-level configuration read by the functions below.
	page0 = ""  # URL of the first page; fetched by do_stuff()
	page1 = ""  # URL of the second page; fetched by do_stuff()
	class_term_number_struct = "" # ie ([0-9]{3}[A-Z]) for 123X
	class_subject_names = [] # where to stash possible class prefixes like Mathematics
	debug = 0  # getURL() prints a note about non-200 responses when this is 1

	# script information setters
	# TODO: set up objects for raw HTML to handle target sections of a page
	def set_target_class_number_struct(re_string):
		"""Store *re_string* as the module-wide class-number regex.

		The value is written to the module global
		``class_term_number_struct``, which ``parse_class_info`` splices into
		its search pattern.
		"""
		# TODO: insert re string formatting check?
		global class_term_number_struct
		class_term_number_struct = re_string

	def set_page_URLs(target_page0, target_page1):
		"""Set the two page URLs that ``do_stuff`` downloads and compares.

		Args:
			target_page0: URL stored in the module global ``page0``.
			target_page1: URL stored in the module global ``page1``.
		"""
		# Without this declaration the assignments below only bind locals
		# and the module-level URLs never change.
		global page0, page1
		# TODO: filter URLs for good pages?
		page0 = target_page0
		page1 = target_page1


	# these functions actually do things
	def getURL(url_string, timeout=30):
		"""Fetch *url_string* with HTTP GET and return the response body.

		Args:
			url_string: URL to request.
			timeout: Seconds to wait for the server before giving up.

		Returns:
			The response text, or ``None`` when the connection fails, the
			request times out, or the server answers with a status other
			than 200.
		"""
		# Imported here because the module has no top-level import of requests.
		import requests

		# The debug flag is optional: a missing global means "off" rather
		# than a NameError on the error path.
		verbose = globals().get("debug", 0) == 1

		try:
			pagedata = requests.get(url_string, timeout=timeout)
		except (requests.exceptions.ConnectionError,
				requests.exceptions.Timeout) as err:
			# Network-level failures (including exhausted retries) are
			# reported like a bad status code; malformed URLs still raise.
			if verbose:
				print("request failed for URL: " + url_string[:20] + "... (" + str(err) + ")")
			return None

		if pagedata.status_code != 200:
			if verbose:
				print("got a non-200 response code for URL: " + url_string[:20] + "...")
			return None
		return pagedata.text

	def parse_class_info(classdata):
		"""Find every class-number match in *classdata*.

		The search pattern is the module global ``class_term_number_struct``
		wrapped in up to 20 characters of leading and trailing context.  As
		with ``re.findall``, a pattern containing one capture group yields
		only the captured text, while a pattern without groups yields the
		whole match including its context.

		Args:
			classdata: Page text to search, or ``None`` (what ``getURL``
				returns for a failed download).

		Returns:
			A list of the distinct matches in first-seen order; empty when
			*classdata* is ``None`` or nothing matches.
		"""
		import re  # local import: the module has no top-level import of re

		if classdata is None:
			return []

		# Example patterns this was developed against:
		#   re.findall(r"Mathematics ([0-9]{3}[A-Z])", classdata)
		#   re.findall(r".{0,20}427L.{0,20}", classdata)
		matches = re.findall(r".{0,20}" + class_term_number_struct + ".{0,20}", classdata)
		# dict.fromkeys de-duplicates while keeping page order, and also
		# works for the tuples findall returns with several groups.
		return list(dict.fromkeys(matches))

	def crossover_check(class1, class2):
		"""Return the entries of *class1* that also occur in *class2*.

		Order and duplicates follow *class1*; membership is tested with
		``in`` against *class2*.
		"""
		return [x for x in class1 if x in class2]

	def filter_entries(class_data):
		"""Return the entries of *class_data* that contain no filter term.

		The only filter term at the moment is ``"td"`` (presumably to
		discard matches that picked up HTML table-cell markup -- confirm).
		"""
		results = class_data
		filter_targets = ["td"]
		for term in filter_targets:
			# Each pass drops the entries containing one unwanted term.
			results = [x for x in results if term not in x]
		# TODO: filter class matches for stuff that has raw HTML in its preceding text
		return results

	def refilter_pages(page_data, target_list):
		"""Search *page_data* for each target together with its preceding text.

		Every target is looked for as ``<15-20 characters><space><target>``,
		so each returned snippet carries the text that sits directly in
		front of the target on the page.

		Args:
			page_data: Page text to search, or ``None``.
			target_list: Literal strings to look for (``do_stuff`` passes
				the output of ``crossover_check``).

		Returns:
			The distinct snippets in first-seen order, excluding any snippet
			that contains ``"this"``.  Empty when there is nothing to
			search, nothing to search for, or nothing matches.
		"""
		import re  # local import: the module has no top-level import of re

		# An empty alternation would match the empty string everywhere, and
		# a missing page cannot be searched, so bail out early.
		if not page_data or not target_list:
			return []

		# TODO: fix below. regex could be better.
		# Join with a bare "|" (an escaped pipe would match a literal pipe)
		# and escape the targets, which are literal page text.
		alternatives = "|".join(".{15,20} " + re.escape(x) for x in target_list)
		# A single capture group makes findall return plain strings.
		search_results = re.findall(r".{0,20}(" + alternatives + ")", page_data)
		dedupe_results = list(dict.fromkeys(search_results))
		return [x for x in dedupe_results if "this" not in x]

	def do_stuff():
		"""Report the class matches that appear on both configured pages.

		Downloads ``page0`` and ``page1``, extracts the class matches from
		each with ``parse_class_info``, keeps the ones found on both pages,
		and then re-searches the first page for those matches together with
		their preceding text.

		Returns:
			The list produced by ``refilter_pages`` for the first page, or
			an empty list when either page could not be downloaded or the
			pages have no matches in common.
		"""
		page0_data = getURL(page0)
		page1_data = getURL(page1)
		# getURL signals failure with None; the regex helpers need real text.
		if page0_data is None or page1_data is None:
			return []

		class_info_page0 = parse_class_info(page0_data)
		class_info_page1 = parse_class_info(page1_data)
		crossover = crossover_check(class_info_page0, class_info_page1)
		# With nothing in common there is nothing to look up again.
		if not crossover:
			return []
		return refilter_pages(page0_data, crossover)

	#def get_class_prefixes(page_text, class_list):
	#class_string = "".join([("\|" + x) for x in class_list])
	#class_string = class_string.lstrip("\|")
	#results = re.findall(r".{0,20}(" + class_string + ")")

	#breakpoint()
	#return results

	# TODO: set up limit for running time
	# TODO: set up HTML output and server on 8080, cuz terminal is nice, but not for everyone...