Last active
August 29, 2018 07:00
-
-
Save VioletVivirand/9372145e9e6adeac33191b6423918aec to your computer and use it in GitHub Desktop.
Taobao Web Spider Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
# Taobao search-results page to scrape.
url = 'https://s.taobao.com/search?q=%E5%B0%BF%E8%A4%B2&type=p&tmhkh5=&spm=a21wu.241046-tw.a2227oh.d100&from=sea_1_searchbutton&catId=100'

# Exactly one browser is started. Previously a Firefox driver was created and
# then immediately replaced by a PhantomJS driver, which left the first browser
# process running with nothing referencing it.
USE_PHANTOMJS = False

if USE_PHANTOMJS:
    # PhantomJS version
    driver = webdriver.PhantomJS()  # WARNING: Deprecated
else:
    # Firefox version
    # Ref: https://developer.mozilla.org/en-US/docs/Mozilla/Firefox/Headless_mode#Selenium_in_Python
    options = Options()
    options.add_argument('-headless')
    # Placeholder: replace with the real path of the local Firefox executable.
    binary = FirefoxBinary('path/to/installed firefox binary')
    # NOTE(review): newer Selenium releases spell this keyword `options=`;
    # confirm against the installed Selenium version before changing it.
    driver = Firefox(firefox_binary=binary, firefox_options=options)

# Only the rendered HTML is needed, so the browser is shut down as soon as it
# has been captured -- even if loading the page fails.
try:
    driver.get(url)
    t = driver.page_source
finally:
    driver.quit()

soup = BeautifulSoup(t, 'html5lib')
boxes = soup.find_all('div', {'class': 'ctx-box'})

# For every result box, take the link inside its "row-2" div: the href and the
# first non-blank text fragment of the anchor (used as the title). Boxes that
# lack the div, the anchor, the href or any text are skipped so that one
# malformed box does not abort the run and `hrefs`/`titles` stay index-aligned.
hrefs = []
titles = []
for b in boxes:
    row = b.find('div', {'class': ['row-2']})
    link = row.a if row is not None else None
    if link is None:
        continue
    href = link.get('href')
    title = next(iter(link.stripped_strings), None)
    if href is None or title is None:
        continue
    hrefs.append(href)
    titles.append(title)

if hrefs:
    # Links scraped from a page may be relative or scheme-relative, so resolve
    # the href against the page URL before requesting it. Redirects are not
    # followed; a timeout keeps a stalled server from hanging the script.
    r = requests.get(urljoin(url, hrefs[0]), allow_redirects=False, timeout=10)
    print(r.url)
else:
    # Nothing matched (empty result page or changed markup).
    r = None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment