Skip to content

Instantly share code, notes, and snippets.

@VioletVivirand
Last active August 29, 2018 07:00
Show Gist options
  • Save VioletVivirand/9372145e9e6adeac33191b6423918aec to your computer and use it in GitHub Desktop.
Save VioletVivirand/9372145e9e6adeac33191b6423918aec to your computer and use it in GitHub Desktop.
Taobao Web Spider Example
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
# Taobao search-results page to scrape (the search term is URL-encoded in `q`).
url = 'https://s.taobao.com/search?q=%E5%B0%BF%E8%A4%B2&type=p&tmhkh5=&spm=a21wu.241046-tw.a2227oh.d100&from=sea_1_searchbutton&catId=100'

# Choose exactly ONE browser backend. Creating both drivers back to back
# rebinds `driver` and strands the first browser process with no handle left
# to shut it down.
USE_PHANTOMJS = False  # PhantomJS is deprecated; headless Firefox is preferred

if USE_PHANTOMJS:
    # PhantomJS version
    driver = webdriver.PhantomJS()  # WARNING: Deprecated
else:
    # Firefox version
    # Ref: https://developer.mozilla.org/en-US/docs/Mozilla/Firefox/Headless_mode#Selenium_in_Python
    options = Options()
    options.add_argument('-headless')
    binary = FirefoxBinary('path/to/installed firefox binary')
    # NOTE(review): newer Selenium releases appear to spell this keyword
    # `options=` instead of `firefox_options=` -- confirm against the
    # installed Selenium version before changing it.
    driver = Firefox(firefox_binary=binary, firefox_options=options)

# Render the page in the browser and grab the resulting HTML. The browser is
# always shut down afterwards, even if loading the page raises.
try:
    driver.get(url)
    t = driver.page_source
finally:
    driver.quit()

soup = BeautifulSoup(t, 'html5lib')
boxes = soup.find_all('div', {'class': 'ctx-box'})

# Collect the link target and the first text fragment of each result box.
# `hrefs` and `titles` are kept index-aligned: a box that lacks the `row-2`
# div, its anchor, the anchor's href, or any anchor text is skipped entirely
# instead of raising AttributeError/IndexError.
hrefs = []
titles = []
for box in boxes:
    row = box.find('div', {'class': ['row-2']})
    link = row.a if row is not None else None
    if link is None:
        continue
    href = link.get('href')
    title = next(link.stripped_strings, None)
    if href is None or title is None:
        continue
    hrefs.append(href)
    titles.append(title)

if hrefs:
    # Fetch the first result without following redirects. The timeout keeps
    # the script from blocking forever on an unresponsive server.
    r = requests.get(hrefs[0], allow_redirects=False, timeout=10)
    # A bare `r.url` expression only displays a value in an interactive
    # session; print it so a plain script run shows it too.
    print(r.url)
else:
    print('No result links found on the page.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment