Skip to content

Instantly share code, notes, and snippets.

@pablospizzamiglio
Created July 18, 2020 12:55
Show Gist options
  • Save pablospizzamiglio/412412bc8de2f1e08cb3a2dc8f781eae to your computer and use it in GitHub Desktop.
Save pablospizzamiglio/412412bc8de2f1e08cb3a2dc8f781eae to your computer and use it in GitHub Desktop.
Web Scraper that gets product information from Lenovo's e-commerce site
from contextlib import closing
# pip install beautifulsoup4 requests
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
def simple_get(url, timeout=10):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    If the response passes `is_good_response` (HTTP 200 with an HTML
    content type), return its raw body as bytes, otherwise return None.
    Request failures are logged through `log_error` and also yield None.

    `timeout` is the number of seconds (or a `(connect, read)` tuple)
    handed to requests. Without it a stalled server would block the call
    indefinitely. A timeout surfaces as a RequestException and is
    therefore logged and reported as None like any other request error.
    """
    try:
        # stream=True defers the body download; closing() guarantees the
        # connection is released even when the response is rejected.
        with closing(get(url, stream=True, timeout=timeout)) as response:
            if is_good_response(response):
                return response.content
            return None
    except RequestException as e:
        log_error("Error during requests to {0} : {1}".format(url, str(e)))
        return None
def is_good_response(response):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its
    Content-Type header contains "html" (case-insensitive). A response
    without a Content-Type header is treated as not HTML instead of
    raising KeyError.
    """
    # .get() with an empty fallback: the header is optional, and indexing
    # the mapping directly would raise before any check could run.
    content_type = (response.headers.get("Content-Type") or "").lower()
    return response.status_code == 200 and "html" in content_type
def log_error(e):
    """
    Report an error.

    The value is simply written out with print(); swap the body for a
    real logging call if errors should go somewhere else.
    """
    message = e
    print(message)
def get_product_metadata(url):
    """
    Downloads the page where the Product detail is found and returns a
    dictionary containing the relevant metadata.

    The dictionary maps the `name` attribute of each matching <meta> tag
    to its `content` attribute. Only the names listed in `meta_names`
    below are kept, and tags without a `content` attribute are skipped.

    Raises Exception if `simple_get` returns None for `url`.
    """
    response = simple_get(url)
    if response is None:
        # Raise an exception if we failed to get any data from the url
        raise Exception("Error retrieving contents at {}".format(url))

    html = BeautifulSoup(response, "html.parser")
    # Every entry needs its own trailing comma: adjacent string literals
    # are silently concatenated, which would merge two names into one
    # that never matches any tag.
    meta_names = {
        "description",
        "productcode",
        "productid",
        "productprice",
        "productsaleprice",
        "productstatus",
    }
    return {
        meta.attrs["name"]: meta.attrs["content"]
        for meta in html.select("meta")
        if meta.attrs.get("name") in meta_names and "content" in meta.attrs
    }
if __name__ == "__main__":
    # Example run against a single product page. The result is printed so
    # that running the script produces visible output.
    product_url = "https://www.lenovo.com/us/en/laptops/thinkpad/thinkpad-x/ThinkPad-X1-Carbon-6th-Gen/p/22TP2TXX16G"
    for name, content in get_product_metadata(product_url).items():
        print("{}: {}".format(name, content))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment