Skip to content

Instantly share code, notes, and snippets.

@dradecic
Created September 12, 2019 08:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dradecic/2cf982730b0f9f1666a0117c9987ffe9 to your computer and use it in GitHub Desktop.
Save dradecic/2cf982730b0f9f1666a0117c9987ffe9 to your computer and use it in GitHub Desktop.
scraping-1-script
def get_books(topic_list):
    """Scrape books.toscrape.com category pages and return the books found.

    Parameters
    ----------
    topic_list : list of str
        Category slugs exactly as they appear in the site's category URLs,
        e.g. ``'travel_2'`` or ``'mystery_3'``.

    Returns
    -------
    pandas.DataFrame
        One row per book with columns: Topic, Title, Rating, Price,
        Availability, Link, Thumbnail. Missing fields are empty strings.

    Notes
    -----
    Performs one HTTP GET per topic (network required). Progress and a
    runtime summary are printed to stdout.
    """
    # Build the full category-index URLs from the user-supplied slugs.
    all_urls = [
        'http://books.toscrape.com/catalogue/category/books/{}/index.html'.format(topic)
        for topic in topic_list
    ]

    # One OrderedDict per scraped book.
    all_books = []

    # Inform the user that the scraping has started, and when it started.
    start_time = datetime.datetime.now()
    print('Book Scraping in Progress... Time Started: {}'.format(
        datetime.datetime.strftime(start_time, '%d.%m.%Y %H:%M:%S')))

    for url in all_urls:
        # Fetch and parse the category page.
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'lxml')

        # Topic is encoded in the URL; strip the trailing '_<id>' suffix
        # ('travel_2' -> 'travel').
        curr_topic = url.split('/')[-2].split('_')[0]

        # Each book on the page lives in an <article class="product_pod">.
        for book in soup.find_all('article', attrs={'class': 'product_pod'}):
            # Default to empty strings so missing fields don't raise later.
            book_title = ''
            book_link = ''
            thumbnail_link = ''
            rating = ''
            price = ''
            availability = ''

            # Title is the 'title' attribute of the anchor inside <h3>.
            title_anchor = book.find('h3').find('a')
            if title_anchor is not None:
                book_title = title_anchor.get('title')

            # Book page link and thumbnail both hang off the image container.
            image_container = book.find('div', attrs={'class': 'image_container'})
            if image_container.find('a') is not None:
                # hrefs/srcs are relative ('../../../x.html'); keep only the
                # final path segment and re-anchor it to an absolute base URL.
                book_url = image_container.find('a').get('href')
                book_link = ('http://books.toscrape.com/catalogue/'
                             + book_url.split('../')[-1])
                thumbnail_url = image_container.find('img').get('src')
                thumbnail_link = ('http://books.toscrape.com/'
                                  + thumbnail_url.split('../')[-1])

            # Rating is stored as the last CSS class, e.g. 'star-rating Three'.
            star_rating = book.find('p', attrs={'class': 'star-rating'})
            if star_rating is not None:
                rating = star_rating.get('class')[-1]

            # Price and availability share the same product_price container.
            product_price = book.find('div', attrs={'class': 'product_price'})
            if product_price is not None:
                price = product_price.find(
                    'p', attrs={'class': 'price_color'}).get_text()
                availability = product_price.find(
                    'p', attrs={'class': 'instock availability'}).get_text().strip()

            all_books.append(OrderedDict({
                'Topic': curr_topic,
                'Title': book_title,
                'Rating': rating,
                'Price': price,
                'Availability': availability,
                'Link': book_link,
                'Thumbnail': thumbnail_link
            }))

    # Inform the user that scraping has finished and report how long it took.
    end_time = datetime.datetime.now()
    duration = int((end_time - start_time).total_seconds())
    print('Scraping Finished!')
    print('\tIt took {} seconds to scrape {} books'.format(duration, len(all_books)))

    # Return a tabular representation of everything collected.
    return pd.DataFrame(all_books)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment