Scrape large Amazon product images from a list of ASINs
# python3
from urllib import request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import re
import os
SAVE_DIR = 'imgs'
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36'
AMAZON_BASE_URL = 'https://www.amazon.co.jp/dp'
asins = ['B01J6RPOJY', 'B072B5BTLK'] # Ex: Fire HD 10 32GB, Amazon Echo Dot
# create a save directory per ASIN
for asin in asins:
    os.makedirs(os.path.join(SAVE_DIR, asin), exist_ok=True)
# set the user agent on the PhantomJS capabilities
ua = dict(DesiredCapabilities.PHANTOMJS)
ua['phantomjs.page.settings.userAgent'] = USER_AGENT
# start driver
driver = webdriver.PhantomJS(desired_capabilities=ua)
for asin in asins:
    # fetch the product page for this ASIN
    url = AMAZON_BASE_URL + '/' + asin
    driver.get(url)
    # NOTE: click each thumbnail so the page exposes the large image URLs
    thumb_list = driver.find_elements_by_css_selector('li.imageThumbnail')
    for thumb in thumb_list:
        thumb.click()
    # parse the rendered page with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html5lib')
    # NOTE: collect the img tags (this selector is provisional and may need adjusting)
    img_tags = soup.select('.image img')
    for idx, img_tag in enumerate(img_tags):
        src = img_tag['src']
        # strip the size suffix to get the large image URL (xxx._YYYYY_.jpg -> xxx.jpg)
        large_img_url = re.sub(r'\._.*_', '', src)
        # build the save path: imgs/<ASIN>/<index>.<ext>
        _, ext = os.path.splitext(src)
        fn = str(idx) + ext
        save_path = os.path.join(SAVE_DIR, asin, fn)
        # download and save the image
        request.urlretrieve(large_img_url, save_path)
# end driver
driver.quit()
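
PhantomJS development has been suspended and recent Selenium releases no longer support it, so the same flow can be driven by headless Chrome instead. Below is a minimal sketch of the alternative driver setup only; it assumes Chrome and a matching chromedriver are installed and on the PATH, and the scraping loop above stays the same.

# python3
# Minimal sketch (assumption: Chrome + chromedriver available on PATH);
# this replaces the PhantomJS setup, the rest of the script is unchanged.
from selenium import webdriver

USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36'

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('user-agent=' + USER_AGENT)
driver = webdriver.Chrome(options=options)

Note that Selenium 4 also removes the find_elements_by_css_selector helper in favor of find_elements(By.CSS_SELECTOR, ...), so the selector call may need the same kind of update if you upgrade.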