Created
May 6, 2020 14:37
-
-
Save kunalrustagi08/5af492f30e3fad02a2b2b9a8b51e1ec7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import time | |
import random | |
import numpy as np | |
from multiprocessing import Pool | |
# Module-level scrape targets: books.toscrape.com paginates its catalogue
# as page-1.html .. page-50.html.
url_list = []
# A plain range suffices here; numpy's arange added nothing for integer steps.
pages = range(1, 51)

def generate_urls():
    """Populate the module-level ``url_list`` with one catalogue URL per page.

    Appends (does not reset) ``url_list``; calling it twice duplicates entries,
    matching the original behavior.
    """
    url_list.extend(
        f'http://books.toscrape.com/catalogue/page-{page}.html' for page in pages
    )
def scrape_url(url):
    """Fetch one books.toscrape.com catalogue page and scrape its book listings.

    Parameters
    ----------
    url : str
        Full URL of a catalogue page (e.g. '.../catalogue/page-1.html').

    Returns
    -------
    tuple
        (book_title, product_price, star_rating) — three parallel lists with
        one entry per book card found on the page.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    book_title = []
    star_rating = []
    product_price = []
    # Random delay so the pooled workers don't hit the server in lockstep.
    time.sleep(random.randint(1, 10))
    # Fix: the original call had no timeout, so a stalled connection would
    # hang a worker forever. Also surface HTTP errors instead of silently
    # parsing an error page.
    results = requests.get(url, timeout=30)
    results.raise_for_status()
    soup = BeautifulSoup(results.text, 'html.parser')
    # Each book card on the page is an <li> with this exact class string.
    book_div = soup.find_all('li', class_='col-xs-6 col-sm-4 col-md-3 col-lg-3')
    for container in book_div:
        book_title.append(container.article.h3.a['title'])
        product_price.append(
            container.article.find('div', class_='product_price').p.text
        )
        # The rating is encoded as the last CSS class on the card's first <p>,
        # e.g. <p class="star-rating Three"> -> 'Three'.
        star_rating.append(container.article.p['class'][-1])
    return (book_title, product_price, star_rating)
# Fix: multiprocessing requires the pool setup to be guarded by
# __name__ == '__main__' — on spawn-start platforms (Windows, macOS) the
# worker processes re-import this module, and an unguarded Pool() would
# spawn workers recursively.
if __name__ == '__main__':
    generate_urls()
    start = time.time()
    # The context manager replaces the original explicit terminate()/join():
    # Pool.__exit__ performs the same terminate-and-join once map() returns.
    with Pool(10) as p:
        book_list = p.map(scrape_url, url_list)
    end = time.time()
    print('It took', (end - start), 'seconds')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment