secret
Last active

  • Download Gist
sel-update.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
# To run this script, from the command line type:
# python sel-update.py
# The script does not write a results file itself; each batch of documents is
# saved by Firefox through its automatic-download setting (see below).
 
# Please make sure you have installed the following:
# * Python 2.6 or higher (not Python 3)
# * Latest version of Firefox
# * Selenium for Python [from the command line, type pip install selenium]
 
# If you don't have pip installed, you need to install it:
# Windows: http://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows
# Mac: You should be able to type easy_install pip from the command line
 
 
# You must also set firefox to automatically download files:
# http://kb.mozillazine.org/File_types_and_download_actions
 
# Still to do: write checks for what happens when you have too many results or too few, and adjust the delta time accordingly
# Add in the other options for searching
 
 
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
import selenium
import datetime
import time
import re
import csv
 
def isReady(browser):
    """Return True when the browser's current document has finished loading.

    browser -- an object exposing ``execute_script(script)`` (here, the
               Selenium WebDriver instance).

    The check asks the page for ``document.readyState`` and compares the
    answer with the string "complete"; any other value yields False.
    """
    return browser.execute_script("return document.readyState") == "complete"
 
## Search settings -- edit these before running.
# Text typed into the search box; the double quotes are part of the query.
terms = '"sars"'

## First and last day to search, each given as (year, month, day).
start_date = datetime.date(2003, 1, 1)
end_date = datetime.date(2013, 1, 1)

# Loop state: the day currently being searched and the step between searches.
d = start_date
delta = datetime.timedelta(1)

# Stays True until the first download has been started.
firsttime = True

# Launch Firefox and build a 10-second explicit wait bound to that browser.
browser = webdriver.Firefox()
wait = ui.WebDriverWait(browser, 10)
# Main loop: for every day from start_date through end_date, run one search
# for `terms` and download all hits through the site's "Download Documents"
# popup, in batches of at most BATCH_SIZE documents.
#
# NOTE(review): each search covers d .. d+delta while d only advances by
# delta, so consecutive searches appear to share a boundary day -- confirm
# whether the site treats the range as inclusive (possible duplicate hits).
# NOTE(review): a search with no hits presumably lacks the pagination element
# read below, which would raise IndexError -- still to be handled.

# Largest document range requested per download (the original hard-coded 500).
BATCH_SIZE = 500

while d <= end_date:
    thisurl = 'http://www.lexisnexis.com.ezp-prod1.hul.harvard.edu/hottopics/lnacademic/?shr=t&sfi=AC01NBSimplSrch'
    try:
        browser.get(thisurl)
    except Exception:
        # Retry once from the first window. `except Exception` (rather than a
        # bare except) lets Ctrl-C / SystemExit stop the script.
        browser.switch_to_window(browser.window_handles[0])
        browser.get(thisurl)
    while not isReady(browser):
        time.sleep(1)

    # The search form lives inside the frame named "mainFrame".
    browser.switch_to_frame("mainFrame")

    # Choose the "from" option of the date selector so a range can be typed.
    dateoptions = browser.find_elements_by_xpath('//*[@id="dateSelector1"]')[0]
    select = ui.Select(dateoptions)
    select.select_by_value("from")

    # Fill in the date range d .. d+delta as MM/DD/YYYY.
    startbox = browser.find_elements_by_xpath('//*[@id="fromDate1"]')[0]
    startbox.send_keys(d.strftime("%m/%d/%Y"))
    endbox = browser.find_elements_by_xpath('//*[@id="toDate1"]')[0]
    endbox.send_keys((d + delta).strftime("%m/%d/%Y"))

    # Type the query and press RETURN to submit the search.
    searchbox = browser.find_elements_by_xpath('.//input[@id="terms"]')[0]
    searchbox.send_keys(terms + Keys.RETURN)
    while not isReady(browser):
        time.sleep(1)

    # Keep retrying until the nested results frame (second frame inside the
    # second frame; author's selector hint: fs_main > frame:nth-child(2))
    # can be entered.
    loaded = False
    while not loaded:
        try:
            browser.switch_to_default_content()
            browser.switch_to_frame(1)
            browser.switch_to_frame(1)
            loaded = True
        except Exception:
            time.sleep(1)
            browser.switch_to_default_content()

    # After the page has loaded, re-enter the nested results frame.
    browser.switch_to_default_content()
    browser.switch_to_frame(1)
    browser.switch_to_frame(1)

    # How many documents are there?  The pagination text is split on "of"
    # and the part after it is parsed as the total.
    pagination = browser.find_elements_by_xpath('//span[@class="l3b paginationalign"]')[0]
    nresults = int(pagination.text.split("of")[1])
    lastresult = 0
    dcomplete = False

    while not dcomplete:
        browser.switch_to_default_content()
        browser.switch_to_frame(1)
        browser.switch_to_frame(1)

        browser.find_elements_by_xpath('.//img[@title="Download Documents"]')[0].click()
        # The click opens a popup; wait for its window handle to exist before
        # indexing it, instead of racing the browser.
        wait.until(lambda drv: len(drv.window_handles) > 1)
        browser.switch_to_window(browser.window_handles[1])
        while not isReady(browser):
            time.sleep(1)

        # In the popup, pick the download format.
        downloadoptions = browser.find_elements_by_xpath('//*[@id="delFmt"]')[0]
        select = ui.Select(downloadoptions)
        select.select_by_value("QDS_EF_GENERICTYPE")

        if nresults > BATCH_SIZE:
            # Too many hits for one download: request the next range and
            # finish only once the final range reaches nresults.
            print("More than 500 results: {0}".format(nresults))
            rangebox = browser.find_elements_by_xpath('.//input[@id="rangetextbox"]')[0]
            nextresult = min(lastresult + BATCH_SIZE, nresults)
            rangebox.send_keys("{0}-{1}".format(lastresult + 1, nextresult))
            print("clicking rangebox...")
            rangebox.click()
            time.sleep(1)
            lastresult = nextresult
            if lastresult == nresults:
                dcomplete = True
        else:
            # Everything fits in a single download.
            dcomplete = True

        browser.find_elements_by_xpath('//img[@title="Download"]')[0].click()

        # It takes a while for the download link to appear.
        while not isReady(browser):
            time.sleep(1)

        while len(browser.find_elements_by_xpath('//center[@class="suspendbox"]')) == 0:
            time.sleep(1)

        # Firefox must be set to download automatically, otherwise this click
        # only opens a save dialog the script cannot answer.
        browser.find_elements_by_xpath('//center[@class="suspendbox"]//a')[0].click()

        # The very first download gets an extra 10 seconds.
        if firsttime:
            time.sleep(10)
            firsttime = False

        # Wait a second to ensure the download is going.
        time.sleep(1)

        # Close the popup, retrying until the close button can be clicked.
        closed = False
        while not closed:
            try:
                print("trying to close...")
                browser.find_elements_by_xpath('.//img[@title="Close Window"]')[0].click()
                closed = True
            except Exception:
                print("couldn't close...sleeping")
                time.sleep(1)

        # Back to the main window for the next batch or the next day.
        browser.switch_to_window(browser.window_handles[0])

    d = d + delta

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.