Example Code used in my Metis Investigation Presentation
"""Better Exceptions and Loguru
Metis Investigation by Moritz Eilfort
January 28th, 2019
Example code for presentation purposes.
Code: https://gist.github.com/JimFawkes/76a649c7bdf8bfbbc2b2051c98995789
Summary: https://gist.github.com/JimFawkes/e5f767288e6d8e2df8fa53b5862db9d6
"""
import random

from loguru import logger

logger.add("logs/my_first_log_file.log")
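
# logger.add() attaches a file sink next to loguru's default stderr sink, so
# every logger call below is written to both. For better-exceptions-style
# tracebacks (the other half of this investigation's title), loguru sinks also
# accept backtrace=True and diagnose=True, e.g.:
#   logger.add("logs/my_first_log_file.log", backtrace=True, diagnose=True)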


def scrape_data(url):
    """Get data from a website."""
    data_option_1 = {
        "status_code": 200.0,
        "content": " Yeah#it#worked.\n# ",
        "url": url,
    }
    data_option_2 = {"status_code": 200, "content": None, "url": url}
    data_option_3 = {
        "status_code": 403,
        "error": {"msg": "API RATE LIMIT EXCEEDED!"},
        "url": url,
    }
    return random.choice([data_option_1, data_option_2, data_option_3])
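

# Each payload from scrape_data() exercises a different path downstream:
#   data_option_1 -> cleans successfully (float status code, noisy content)
#   data_option_2 -> "content" is None, so clean_content() raises AttributeError
#   data_option_3 -> has no "content" key at all, so clean() raises KeyError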


def clean_content(content):
    """Remove leading and trailing whitespace and strange characters."""
    data_content = content.replace("#", " ").replace("\n", "").strip()
    return data_content


def clean(data):
    """Clean the status_code and content."""
    data["status_code"] = int(data["status_code"])
    data["content"] = clean_content(data["content"])
    return data


def get_clean_data_from_url(url="www.some-website.com"):
    """Run our entire pipeline.

    1. Get the data by scraping the url
    2. Clean the data
    3. Return the data
    """
    data = scrape_data(url)
    clean_data = clean(data)
    return clean_data


def scrape_all_pages(pages):
    """Scrape all sub-pages of a website."""
    results = []
    for page in pages:
        logger.info(f"Get data for page {page}")
        try:
            clean_data = get_clean_data_from_url(page)
            results.append((page, clean_data))
        except AttributeError as e:
            logger.warning(
                f"Caught an AttributeError when retrieving website: {page}. Re-trying..."
            )
            results += scrape_all_pages([page])
        except KeyError as e:
            logger.exception(
                f"Caught a KeyError when retrieving website: {page}. Ignoring this site. Solve later..."
            )
            # logger.error(e)
            continue
    return results
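

# logger.exception() logs at ERROR level and appends the full traceback of the
# active exception, while logger.warning() records only the message. So the
# KeyError branch leaves a stack trace in the log file, whereas the
# AttributeError retry does not.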


if __name__ == "__main__":
    pages = [
        "www.thisismetis.com",
        "www.google.com",
        "www.github.com",
        "www.stackoverflow.com",
        "www.some-website.com",
    ]

    results = scrape_all_pages(pages)
    print(f"Got {len(results)}/{len(pages)} results:")
    for result in results:
        print(f"{result[0]} - {result[1]}")