Skip to content

Instantly share code, notes, and snippets.

@alexstorer
Last active December 22, 2015 22:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexstorer/a6d2279fad0821d703b5 to your computer and use it in GitHub Desktop.
Save alexstorer/a6d2279fad0821d703b5 to your computer and use it in GitHub Desktop.
# To run this script, from the command line type:
# python scholar_selenium.py
# The script itself does not write a results file: each batch of documents
# is saved by Firefox into its download folder (see the download setting below)
# Please make sure you have installed the following:
# * Python 2.6 or higher (not Python 3)
# * Latest version of Firefox
# * Selenium for Python [from the command line, type pip install selenium]
# If you don't have pip installed, you need to install it:
# Windows: http://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows
# Mac: You should be able to type easy_install pip from the command line
# You must also set firefox to automatically download files:
# http://kb.mozillazine.org/File_types_and_download_actions
# Still to do: write checks for what happens when you have too many results or too few, and adjust the delta time accordingly
# Add in the other options for searching
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
import selenium
import datetime
import time
import re
import csv
def isReady(browser):
    """Return True when the browser's current document has finished loading.

    Runs ``return document.readyState`` in the page through
    ``browser.execute_script`` and compares the answer with ``"complete"``.

    browser -- any object exposing ``execute_script(script)``
               (here, the Selenium WebDriver instance).
    """
    return browser.execute_script("return document.readyState") == "complete"
## ---- Search settings: edit these before running ----
# Text typed into the search box (the inner double quotes are sent as well)
terms = '"sars"'
# First date to search from: year, month, day
start_date = datetime.date(year=2003, month=1, day=1)
# Last date to search from: year, month, day
end_date = datetime.date(year=2013, month=1, day=1)
# Width of each search window, and the step between consecutive windows
delta = datetime.timedelta(1)
# Start of the window currently being searched
d = start_date
# True until the first download has been started (it gets a longer pause)
firsttime = True
## ---- Browser session ----
browser = webdriver.Firefox()
wait = ui.WebDriverWait(browser, timeout=10)
def _wait_until_ready():
    """Poll once a second until isReady() reports the focused document loaded."""
    while not isReady(browser):
        time.sleep(1)


def _enter_results_frame():
    """Focus the second frame nested inside the second top-level frame."""
    browser.switch_to_default_content()
    browser.switch_to_frame(1)
    browser.switch_to_frame(1)


# Run one search per date window, moving d from start_date up to end_date,
# and download every document each search returns.
# NOTE(review): each window spans d .. d+delta while d advances by delta, so
# if the site treats the "to" date as inclusive, consecutive windows overlap
# by one day and the last window reaches past end_date -- confirm.
while d <= end_date:
    thisurl = 'http://www.lexisnexis.com.ezp-prod1.hul.harvard.edu/hottopics/lnacademic/?shr=t&sfi=AC01NBSimplSrch'
    try:
        browser.get(thisurl)
    except Exception:
        # Loading failed; switch back to the first window and try once more.
        # "except Exception" (not a bare except) keeps Ctrl-C working.
        browser.switch_to_window(browser.window_handles[0])
        browser.get(thisurl)
    _wait_until_ready()

    # Fill in the search form inside the frame named "mainFrame".
    browser.switch_to_frame("mainFrame")
    dateoptions = browser.find_elements_by_xpath('//*[@id="dateSelector1"]')[0]
    select = ui.Select(dateoptions)
    select.select_by_value("from")
    startbox = browser.find_elements_by_xpath('//*[@id="fromDate1"]')[0]
    startbox.send_keys(d.strftime("%m/%d/%Y"))
    endbox = browser.find_elements_by_xpath('//*[@id="toDate1"]')[0]
    endbox.send_keys((d + delta).strftime("%m/%d/%Y"))
    searchbox = browser.find_elements_by_xpath('.//input[@id="terms"]')[0]
    # RETURN submits the search
    searchbox.send_keys(terms + Keys.RETURN)
    #fs_main > frame:nth-child(2)
    _wait_until_ready()

    # Retry once a second until both nested frames can be entered; on
    # success the focus is left inside the inner frame.
    loaded = False
    while not loaded:
        try:
            _enter_results_frame()
            loaded = True
        except Exception:
            time.sleep(1)

    # How many documents are there?  Give the "... of N" pagination label
    # the WebDriverWait timeout to appear; if it never does, skip this
    # window instead of crashing on an empty element list.
    pagination_xpath = '//span[@class="l3b paginationalign"]'
    try:
        labels = wait.until(
            lambda drv: drv.find_elements_by_xpath(pagination_xpath))
    except selenium.common.exceptions.TimeoutException:
        print("No result count found for {0}; skipping".format(
            d.strftime("%m/%d/%Y")))
        d = d + delta
        continue
    nresults = int(labels[0].text.split("of")[1])

    # Download in batches: a result set larger than 500 is requested as
    # consecutive ranges of at most 500 documents each.
    lastresult = 0
    dcomplete = False
    while not dcomplete:
        _enter_results_frame()
        browser.find_elements_by_xpath('.//img[@title="Download Documents"]')[0].click()
        # The download dialog opens as a second window; wait until its
        # handle exists before switching, otherwise [1] raises IndexError.
        while len(browser.window_handles) < 2:
            time.sleep(1)
        browser.switch_to_window(browser.window_handles[1])
        _wait_until_ready()

        # Choose the download format in the dialog
        downloadoptions = browser.find_elements_by_xpath('//*[@id="delFmt"]')[0]
        select = ui.Select(downloadoptions)
        select.select_by_value("QDS_EF_GENERICTYPE")
        if nresults > 500:
            print("More than 500 results: {0}".format(nresults))
            rangebox = browser.find_elements_by_xpath('.//input[@id="rangetextbox"]')[0]
            nextresult = min(lastresult + 500, nresults)
            rangebox.send_keys("{0}-{1}".format(lastresult + 1, nextresult))
            print("clicking rangebox...")
            rangebox.click()
            time.sleep(1)
            lastresult = nextresult
            # Finished once the range reaches the last document
            dcomplete = (lastresult == nresults)
        else:
            dcomplete = True
        browser.find_elements_by_xpath('//img[@title="Download"]')[0].click()

        # Then, it takes a while to load the download link
        _wait_until_ready()
        while not browser.find_elements_by_xpath('//center[@class="suspendbox"]'):
            time.sleep(1)
        # Firefox must be set up to download automatically, otherwise
        # clicking the link won't save the file.
        browser.find_elements_by_xpath('//center[@class="suspendbox"]//a')[0].click()
        # If it's our first time, wait 10 seconds
        if firsttime:
            time.sleep(10)
            firsttime = False
        # Wait a second to ensure the download is going
        time.sleep(1)

        # Close the dialog window, retrying until the click succeeds
        closed = False
        while not closed:
            try:
                print("trying to close...")
                browser.find_elements_by_xpath('.//img[@title="Close Window"]')[0].click()
                closed = True
            except Exception:
                print("couldn't close...sleeping")
                time.sleep(1)
        browser.switch_to_window(browser.window_handles[0])
        # Debugging aid: list the frame titles
        # all_frames = browser.find_elements_by_xpath('//frame')
        # for f in all_frames:
        #     print(f.get_attribute('title'))
    d = d + delta
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment