Skip to content

Instantly share code, notes, and snippets.

@Nydhal
Created August 10, 2018 20:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Nydhal/49acab8739e9918127f68ba9cae474e5 to your computer and use it in GitHub Desktop.
Save Nydhal/49acab8739e9918127f68ba9cae474e5 to your computer and use it in GitHub Desktop.
# coding=utf-8
import urllib3 as u
from bs4 import BeautifulSoup
from tabulate import tabulate
import pandas as pd
# HTML ``id`` attribute values of the <table> elements that are looked up
# in the parsed page and converted to a DataFrame by table_to_df().
digikey_id_list = ['product-details', 'prod-att-table'] # Digi-Key tables
def get_soup(url):
    """Download a web page and parse it with BeautifulSoup.

    Args:
        url: URL for the web page to be parsed.

    Returns:
        BeautifulSoup object built from the response body using the
        ``html.parser`` backend
        (https://www.crummy.com/software/BeautifulSoup/bs4/doc/#beautifulsoup)
    """
    user_agent = 'Mozilla/5.0'
    # A PoolManager, unlike a pool bound to a single scheme/host, can follow
    # redirects that change scheme or host (for example http -> https).  The
    # ``with`` block releases its connections once the request has finished;
    # the body is already loaded into ``response.data`` at that point.
    with u.PoolManager(headers={'user-agent': user_agent}) as http:
        response = http.request('GET', url)
    return BeautifulSoup(response.data, 'html.parser')
def table_to_df(soup, id_list):
    """Return a pandas DataFrame from the HTML tables found in the soup.

    Args:
        soup: Parsed page exposing ``find`` (a BeautifulSoup object).
        id_list: Iterable of ``id`` attribute values of the ``<table>``
            elements to extract.

    Returns:
        The first table parsed for each id, stacked in ``id_list`` order.
        An empty DataFrame is returned when ``id_list`` is empty.

    Raises:
        ValueError: If the page has no ``<table>`` with one of the ids.
    """
    # Local import keeps this function self-contained; read_html expects a
    # file-like object rather than a literal HTML string on recent pandas.
    from io import StringIO

    frames = []
    for table_id in id_list:
        table = soup.find('table', attrs={'id': table_id})
        if table is None:
            # Fail with a clear message instead of handing the string
            # 'None' to the HTML parser.
            raise ValueError(
                'No <table> with id %r found in the page' % (table_id,))
        frames.append(pd.read_html(StringIO(str(table)))[0])

    # pd.concat raises on an empty list, whereas the original returned an
    # empty frame in that case.
    if not frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concat is its replacement.
    return pd.concat(frames)
def product_image(soup):
    """Return the URL of the product image found in the page.

    The image is the first ``<img>`` inside the ``<div>`` whose id is
    ``product-photo-wrapper``.

    Args:
        soup: Parsed page exposing ``find`` (a BeautifulSoup object).

    Returns:
        The image ``src`` as a string.  ``'https:'`` is prepended only when
        ``src`` carries no scheme of its own (e.g. ``//host/path``), so an
        already absolute URL is returned unchanged.

    Raises:
        AttributeError: If the wrapper ``<div>`` is not found.
        TypeError: If the wrapper contains no ``<img>``.
        KeyError: If the ``<img>`` has no ``src`` attribute.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urlsplit

    wrapper = soup.find('div', attrs={'id': 'product-photo-wrapper'})
    src = wrapper.find('img')['src']
    if urlsplit(src).scheme:
        # Already absolute; prefixing would yield 'https:https://...'.
        return src
    return 'https:' + src
# Digi-Key product page used for this demo.  The URL is assembled from
# adjacent string literals: a backslash continuation inside a string would
# embed the space before it (and anything on the next line) into the path.
product_url = ('http://www.digikey.com/'
               'product-detail/en/microchip-technology/'
               'PIC10F220T-I-OT/PIC10F220T-I-OTCT-ND/1015705')
# Alternative page (keyword search for a Digi-Key part number):
# product_url = 'https://www.digikey.com/products/en?keywords=490-1318-1-ND'

# Download and parse the page once; both the attribute tables and the
# product image are read from the same soup.
product_soup = get_soup(product_url)

df = table_to_df(product_soup, digikey_id_list)
print(tabulate(df, headers='keys', tablefmt='fancy_grid'))
print(product_image(product_soup))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment