@BetterProgramming
Created July 21, 2020 17:09
from bs4 import BeautifulSoup

# Note: get_scrape_config() and extract_domain() are assumed to be
# defined elsewhere in the accompanying project.

# Get outer element
def outer_element(page, identifier):
    root = page.find(*identifier)
    if root is None:
        raise Exception("Could not find root element")
    return root
# Remove unwanted elements
def trim_unwanted(page, identifier_list):
    # An empty list simply yields no iterations, so no length guard is needed
    for identifier in identifier_list:
        for element in page.find_all(*identifier):
            element.decompose()
    return page
# Extract text
def get_text(page, identifier_list):
    # A page config must define at least one text element to extract
    if not identifier_list:
        raise Exception("Need text elements")
    page_text = []
    for identifier in identifier_list:
        for element in page.find_all(*identifier):
            page_text.append(element.text)
    return page_text
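# The identifier arguments above are unpacked straight into BeautifulSoup's
# find()/find_all(), so each identifier is assumed to be a tuple such as
# ("div", {"class": "article-body"}): a tag name plus an attrs dict. For
# illustration (these selectors are hypothetical, not from the original):
#   get_text(soup, [("h1", {}), ("p", {})])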
# Get page config
def load_scrape_config():
    '''Loads page scraping config data'''
    return get_scrape_config()
# Get the scraping config for the site
def get_site_config(url):
    '''Get the scrape config for the site'''
    domain = extract_domain(url)
    config_data = load_scrape_config()
    config = config_data.get(domain, None)
    if config is None:
        raise Exception(f"Config does not exist for: {domain}")
    return config
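# A minimal sketch of the config shape these helpers appear to expect. The
# domain and selectors below are illustrative assumptions; only the keys
# "root_element", "unwanted", and "text_elements" come from the code itself:
#
# {
#     "example.com": {
#         "root_element": ("article", {}),
#         "unwanted": [("aside", {}), ("script", {})],
#         "text_elements": [("h1", {}), ("p", {})],
#     }
# }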
# Build Soup
def page_processor(request):
    '''Returns article text for a fetched page'''
    # Get the page scrape config
    site_config = get_site_config(request.url)
    # Soupify page
    soup = BeautifulSoup(request.text, 'lxml')
    # Retrieve root element
    root = outer_element(soup, site_config["root_element"])
    # Remove unwanted elements
    trimmed_tree = trim_unwanted(root, site_config["unwanted"])
    # Get the desired elements
    text = get_text(trimmed_tree, site_config["text_elements"])
    return " ".join(text)