Last active
October 6, 2020 09:00
-
-
Save ting11222001/f1c9d1ecd2fd097ac78a0872a44e42ac to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time

# Load the selenium package
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Scrape every English example phrase from the gogakuru.com listing, walking
# the result pages with the "next" link, then append the phrases to japan.txt.

# Path to my local chromedriver binary
chrome_driver_path = '/Users/ting11222001/Downloads/chromedriver'
# URL of the first results page
url = 'https://gogakuru.com/english/phrase/genre/180_%E5%88%9D%E7%B4%9A%E3%83%AC%E3%83%99%E3%83%AB.html?layoutPhrase=1&orderPhrase=1&condMovie=0&flow=enSearchGenre&condGenre=180&perPage=50'
# Run the browser in the background (headless) instead of opening a visible
# window, by declaring the options below
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): `executable_path=` is kept for Selenium 3 compatibility; recent
# Selenium 4 releases expect a `Service` object instead - confirm against the
# installed version.
driver = webdriver.Chrome(options=options, executable_path=chrome_driver_path)
# Every example phrase is collected into this list
results = []
# Total number of pages, defined by hand
pages = 187

# try/finally guarantees the browser process is shut down even when a lookup
# or click raises part-way through the crawl.
try:
    # Implicit wait: element lookups keep polling for up to 5 seconds
    driver.implicitly_wait(5)
    driver.get(url)
    for page in range(1, pages + 1):
        print('Now is: Page ', page)
        print('Working...')
        # Grab every example phrase on the current page
        items = driver.find_elements(By.XPATH, "//span[@class='font-en']")
        results.extend(item.text for item in items)
        print('Done!')
        # On the last page there is nothing left to click
        if page == pages:
            break
        # Otherwise click the "next page" link. find_elements returns an empty
        # list (instead of raising) when the link is missing, so a stale
        # hard-coded page count ends the crawl without losing the results.
        next_links = driver.find_elements(By.XPATH, "//span[@class='right']/a")
        if not next_links:
            print('No next-page link found; stopping early.')
            break
        next_links[0].click()
        # Pause 3 seconds before starting the next iteration
        time.sleep(3)
finally:
    # Close the selenium driver
    driver.quit()

# Print the results
print('===Final Results===')
print(results)
# Write each phrase to the .txt file, one per line. The file is opened in
# append mode, so running the script again adds to the existing content.
with open('japan.txt', 'a', encoding='utf8') as file:
    file.writelines(f'{phrase}\n' for phrase in results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment