# BetterProgramming/masteraggreg.py

## masteraggreg.py
# Get outer element
def outer_element(page, identifier):
    '''Return the first element of ``page`` that matches ``identifier``.

    Args:
        page: Object exposing a ``find`` method.
        identifier: Sequence unpacked into the positional arguments of
            ``page.find``.

    Returns:
        The value returned by ``page.find(*identifier)``.

    Raises:
        Exception: If ``page.find`` returns ``None``.
    '''
    root = page.find(*identifier)
    # Identity check: only a genuine ``None`` result means "not found".
    if root is None:
        raise Exception("Could not find root element")
    return root


# Remove unwanted elements
def trim_unwanted(page, identifier_list):
    '''Remove every element matching any of the given identifiers.

    Args:
        page: Object exposing a ``find_all`` method.
        identifier_list: Iterable of identifiers; each one is unpacked into
            the positional arguments of ``page.find_all``. An empty iterable
            or ``None`` means there is nothing to remove.

    Returns:
        The same ``page`` object, after ``decompose()`` has been called on
        every matched element.
    '''
    # Iterating an empty collection is already a no-op, so no explicit
    # length check is needed; ``or ()`` lets ``None`` mean "no identifiers".
    for identifier in identifier_list or ():
        for element in page.find_all(*identifier):
            element.decompose()
    return page


# Extract text
def get_text(page, identifier_list):
    '''Collect the text of every element matching any of the identifiers.

    Args:
        page: Object exposing a ``find_all`` method.
        identifier_list: Non-empty collection of identifiers; each one is
            unpacked into the positional arguments of ``page.find_all``.

    Returns:
        A list with the ``text`` attribute of each matched element, grouped
        in the order the identifiers are given.

    Raises:
        Exception: If ``identifier_list`` is empty.
    '''
    if not identifier_list:
        raise Exception("Need text elements")

    page_text = []
    for identifier in identifier_list:
        page_text.extend(
            element.text for element in page.find_all(*identifier)
        )
    # Return only after every identifier has been processed; returning from
    # inside the loop would drop all identifiers after the first.
    return page_text


# Get page config
def load_scrape_config():
    '''Return the page scraping config data.

    Delegates to ``get_scrape_config`` and hands its result back unchanged.
    '''
    scrape_config = get_scrape_config()
    return scrape_config


# Get the scraping config for the site
def get_site_config(url):
    '''Return the scrape config registered for the site behind ``url``.

    Args:
        url: URL passed to ``extract_domain`` to obtain the lookup key.

    Returns:
        The entry stored under that domain in the data returned by
        ``load_scrape_config``.

    Raises:
        Exception: If the config data has no entry for the domain (or the
            entry is ``None``).
    '''
    domain = extract_domain(url)
    config = load_scrape_config().get(domain)
    # Identity check so that only a missing (None) entry is rejected.
    if config is None:
        raise Exception(f"Config does not exist for: {domain}")
    return config


# Build Soup
def page_processer(request):
    '''Return the article text of a fetched page as one string.

    Args:
        request: Object exposing ``url`` (used to look up the site config)
            and ``text`` (the markup to parse).

    Returns:
        The extracted text fragments joined with single spaces.
    '''
    # Per-site scraping rules
    rules = get_site_config(request.url)

    # Parse the markup
    parsed = BeautifulSoup(request.text, 'lxml')

    # Narrow to the root element, then strip what should not be extracted
    article_root = outer_element(parsed, rules["root_element"])
    cleaned = trim_unwanted(article_root, rules["unwanted"])

    # Pull the desired text and flatten it
    fragments = get_text(cleaned, rules["text_elements"])
    return " ".join(fragments)
	# Get outer element
	def outer_element(page, identifier):
	    '''Return the first element of ``page`` that matches ``identifier``.

	    Args:
	        page: Object exposing a ``find`` method.
	        identifier: Sequence unpacked into the positional arguments of
	            ``page.find``.

	    Returns:
	        The value returned by ``page.find(*identifier)``.

	    Raises:
	        Exception: If ``page.find`` returns ``None``.
	    '''
	    root = page.find(*identifier)
	    # Identity check: only a genuine ``None`` result means "not found".
	    if root is None:
	        raise Exception("Could not find root element")
	    return root


	# Remove unwanted elements
	def trim_unwanted(page, identifier_list):
	    '''Remove every element matching any of the given identifiers.

	    Args:
	        page: Object exposing a ``find_all`` method.
	        identifier_list: Iterable of identifiers; each one is unpacked
	            into the positional arguments of ``page.find_all``. An empty
	            iterable or ``None`` means there is nothing to remove.

	    Returns:
	        The same ``page`` object, after ``decompose()`` has been called
	        on every matched element.
	    '''
	    # Iterating an empty collection is already a no-op, so no explicit
	    # length check is needed; ``or ()`` lets ``None`` mean "no identifiers".
	    for identifier in identifier_list or ():
	        for element in page.find_all(*identifier):
	            element.decompose()
	    return page


	# Extract text
	def get_text(page, identifier_list):
	    '''Collect the text of every element matching any of the identifiers.

	    Args:
	        page: Object exposing a ``find_all`` method.
	        identifier_list: Non-empty collection of identifiers; each one is
	            unpacked into the positional arguments of ``page.find_all``.

	    Returns:
	        A list with the ``text`` attribute of each matched element,
	        grouped in the order the identifiers are given.

	    Raises:
	        Exception: If ``identifier_list`` is empty.
	    '''
	    if not identifier_list:
	        raise Exception("Need text elements")

	    page_text = []
	    for identifier in identifier_list:
	        page_text.extend(
	            element.text for element in page.find_all(*identifier)
	        )
	    # Return only after every identifier has been processed.
	    return page_text


	# Get page config
	def load_scrape_config():
	    '''Return the page scraping config data.

	    Delegates to ``get_scrape_config`` and returns its result unchanged.
	    '''
	    return get_scrape_config()


	# Get the scraping config for the site
	def get_site_config(url):
	    '''Return the scrape config registered for the site behind ``url``.

	    Args:
	        url: URL passed to ``extract_domain`` to obtain the lookup key.

	    Returns:
	        The entry stored under that domain in the data returned by
	        ``load_scrape_config``.

	    Raises:
	        Exception: If the config data has no entry for the domain (or
	            the entry is ``None``).
	    '''
	    domain = extract_domain(url)
	    config = load_scrape_config().get(domain)
	    # Identity check so that only a missing (None) entry is rejected.
	    if config is None:
	        raise Exception(f"Config does not exist for: {domain}")
	    return config


	# Build Soup
	def page_processer(request):
	    '''Return the article text of a fetched page as one string.

	    Args:
	        request: Object exposing ``url`` (used to look up the site
	            config) and ``text`` (the markup to parse).

	    Returns:
	        The extracted text fragments joined with single spaces.
	    '''
	    # Get the page scrape config
	    site_config = get_site_config(request.url)

	    # Soupify page
	    soup = BeautifulSoup(request.text, 'lxml')

	    # Retrieve root element
	    root = outer_element(soup, site_config["root_element"])
	    # Remove unwanted elements
	    trimmed_tree = trim_unwanted(root, site_config["unwanted"])
	    # Get the desired elements
	    text = get_text(trimmed_tree, site_config["text_elements"])
	    return " ".join(text)