-
-
Save alexstorer/a6d2279fad0821d703b5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To run this script, from the command line type:
#     python scholar_selenium.py
# (use whatever file name you saved this script under).
# The downloaded documents are saved by Firefox into its download folder
# (this script does not itself write a results.csv file).
#
# Please make sure you have installed the following:
#   * Python 2.6 or higher (not Python 3)
#   * Latest version of Firefox
#   * Selenium for Python [from the command line, type: pip install selenium]
# If you don't have pip installed, you need to install it:
#   Windows: http://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows
#   Mac: You should be able to type "easy_install pip" from the command line
# You must also set Firefox to automatically download files:
#   http://kb.mozillazine.org/File_types_and_download_actions
#
# Still to do:
#   * Write checks for what happens when a search returns too many or too few
#     results, and adjust the time delta accordingly.
#   * Add in the other options for searching.
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
import selenium.webdriver.support.ui as ui | |
import selenium | |
import datetime | |
import time | |
import re | |
import csv | |
def isReady(browser):
    """Return True when the browser reports the current document as loaded.

    Runs ``return document.readyState`` in the page through
    ``browser.execute_script`` and compares the result with the string
    ``"complete"``.

    browser -- any object exposing ``execute_script(script)``; in this script
               it is the Selenium Firefox driver.
    """
    return browser.execute_script("return document.readyState") == "complete"
def _wait_until_ready(browser):
    """Poll once per second until isReady(browser) reports a loaded page.

    There is no timeout: if the page never reaches the "complete" state this
    loops forever.
    """
    while not isReady(browser):
        time.sleep(1)


def _enter_results_frame(browser):
    """Select the second frame nested inside the second top-level frame.

    Starts from the top-level document each time so the call works no matter
    which frame is currently selected.  Frame indexes are zero-based, so
    ``1`` is the second frame.
    """
    browser.switch_to_default_content()
    browser.switch_to_frame(1)
    browser.switch_to_frame(1)


## Add in what you need here:
terms = '"sars"'

## Date to begin
# year,m,d
start_date = datetime.date(2003, 1, 1)

## Date to end
# year,m,d
end_date = datetime.date(2013, 1, 1)

browser = webdriver.Firefox()
# NOTE(review): `wait` is created but never used below; every wait in this
# script is a manual one-second polling loop.
wait = ui.WebDriverWait(browser, 10)

# The very first download gets an extra 10 second pause (see below).
firsttime = True

# LexisNexis Academic search form, reached through the Harvard library proxy.
thisurl = 'http://www.lexisnexis.com.ezp-prod1.hul.harvard.edu/hottopics/lnacademic/?shr=t&sfi=AC01NBSimplSrch'

# One search is run per step of `delta`, from start_date through end_date.
# NOTE(review): each search spans d .. d+delta and d then advances by delta,
# so if the site treats both bounds as inclusive, consecutive searches share a
# day and the last search reaches past end_date -- confirm before relying on
# the downloads being duplicate-free.
d = start_date
delta = datetime.timedelta(days=1)

while d <= end_date:
    # Load the search form; on failure, refocus the main window and retry once.
    # `except Exception` (not a bare except) so Ctrl-C still stops the script.
    try:
        browser.get(thisurl)
    except Exception:
        browser.switch_to_window(browser.window_handles[0])
        browser.get(thisurl)
    _wait_until_ready(browser)

    # Fill in the date range and the search terms, then submit with RETURN.
    browser.switch_to_frame("mainFrame")
    dateoptions = browser.find_elements_by_xpath('//*[@id="dateSelector1"]')[0]
    select = ui.Select(dateoptions)
    select.select_by_value("from")
    startbox = browser.find_elements_by_xpath('//*[@id="fromDate1"]')[0]
    startbox.send_keys(d.strftime("%m/%d/%Y"))
    endbox = browser.find_elements_by_xpath('//*[@id="toDate1"]')[0]
    endbox.send_keys((d + delta).strftime("%m/%d/%Y"))
    searchbox = browser.find_elements_by_xpath('.//input[@id="terms"]')[0]
    searchbox.send_keys(terms + Keys.RETURN)
    _wait_until_ready(browser)

    # Keep retrying until the nested results frame can be selected.
    loaded = False
    while not loaded:
        try:
            _enter_results_frame(browser)
            loaded = True
        except Exception:
            time.sleep(1)
            browser.switch_to_default_content()

    # After the page has loaded, select the results frame again.
    _enter_results_frame(browser)

    # How many documents are there?  The count is whatever follows "of" in
    # the pagination label.
    # NOTE(review): if that label is missing (presumably when a search finds
    # nothing) the [0] raises IndexError and the script stops -- this is the
    # "too few results" case from the to-do list in the file header.
    nresults = int(browser.find_elements_by_xpath('//span[@class="l3b paginationalign"]')[0].text.split("of")[1])

    # Download in ranges of at most 500 documents (presumably the site's
    # per-download limit -- confirm); lastresult is the last one requested.
    lastresult = 0
    dcomplete = False
    while not dcomplete:
        _enter_results_frame(browser)
        browser.find_elements_by_xpath('.//img[@title="Download Documents"]')[0].click()

        # The download options open in a second window.
        browser.switch_to_window(browser.window_handles[1])
        _wait_until_ready(browser)

        # Pick the download format by its option value.
        downloadoptions = browser.find_elements_by_xpath('//*[@id="delFmt"]')[0]
        select = ui.Select(downloadoptions)
        select.select_by_value("QDS_EF_GENERICTYPE")

        if nresults > 500:
            print("More than 500 results: {0}".format(nresults))
            rangebox = browser.find_elements_by_xpath('.//input[@id="rangetextbox"]')[0]
            nextresult = min(lastresult + 500, nresults)
            rangebox.send_keys("{0}-{1}".format(lastresult + 1, nextresult))
            print("clicking rangebox...")
            rangebox.click()
            time.sleep(1)
            lastresult = nextresult
            if lastresult == nresults:
                dcomplete = True
        else:
            # Everything fits in a single download.
            dcomplete = True

        browser.find_elements_by_xpath('//img[@title="Download"]')[0].click()

        # Then, it takes a while to load the download link.
        _wait_until_ready(browser)
        while len(browser.find_elements_by_xpath('//center[@class="suspendbox"]')) == 0:
            time.sleep(1)

        # Firefox must be set up to download automatically, otherwise
        # clicking the link won't work.
        browser.find_elements_by_xpath('//center[@class="suspendbox"]//a')[0].click()

        # If it's our first time, wait 10 seconds.
        if firsttime:
            time.sleep(10)
            firsttime = False

        # Wait a second to ensure the download is going.
        time.sleep(1)

        # Close the download window, retrying until the click succeeds.
        # `except Exception` keeps Ctrl-C able to break out of this loop.
        closed = False
        while not closed:
            try:
                print("trying to close...")
                browser.find_elements_by_xpath('.//img[@title="Close Window"]')[0].click()
                closed = True
            except Exception:
                print("couldn't close...sleeping")
                time.sleep(1)

        # Back to the main window for the next range (or the next date).
        browser.switch_to_window(browser.window_handles[0])

    d = d + delta
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment