# alexstorer/sel-update.py (Secret)

## sel-update.py
# To run this script, from the command line type:
# python sel-update.py
# The script does not write a results file itself: each batch of documents is
# saved by Firefox's automatic download (see the Firefox setup note below).

# Please make sure you have installed the following:
# * Python 2.6 or higher (not Python 3)
# * Latest version of Firefox
# * Selenium for Python [from the command line, type pip install selenium]

# If you don't have pip installed, you need to install it:
# Windows: http://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows
# Mac: You should be able to type easy_install pip from the command line


# You must also set firefox to automatically download files:
# http://kb.mozillazine.org/File_types_and_download_actions

# Still to do: write checks for what happens when you have too many results or too few, and adjust the delta time accordingly
# Add in the other options for searching


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
import selenium
import datetime
import time
import re
import csv

def isReady(browser):
    """Return True when the page currently loaded in `browser` reports
    a document.readyState of "complete", and False otherwise.

    `browser` is any object with an execute_script(script) method.
    """
    state = browser.execute_script("return document.readyState")
    return state == "complete"

## Add in what you need here:
# Search string typed into the search box; the inner double quotes are part
# of the text that is sent.
terms = '"sars"'

## Date to begin
#                      year,m,d
start_date = datetime.date(2003,1,1)
## Date to end
#                    year,m,d
end_date = datetime.date(2013,1,1)

# One Firefox session is reused for every search.
browser = webdriver.Firefox()
# NOTE(review): `wait` is never used below; all waiting is done by polling.
wait = ui.WebDriverWait(browser,10)

# True until the first download link has been clicked (an extra pause is
# taken after that first click).
firsttime = True

# Run one search per step of `delta`, from start_date through end_date.
# NOTE(review): the end date sent is d + delta and the loop also advances by
# delta, so if the site treats both bounds as inclusive, each day is searched
# twice -- confirm against the site before changing.
d = start_date
delta = datetime.timedelta(days=1)
while d <= end_date:
    thisurl = 'http://www.lexisnexis.com.ezp-prod1.hul.harvard.edu/hottopics/lnacademic/?shr=t&sfi=AC01NBSimplSrch'
    try:
        browser.get(thisurl)
    except Exception:
        # Loading failed; refocus the first window and retry once.
        # `except Exception` (not a bare except) lets Ctrl-C stop the script.
        browser.switch_to_window(browser.window_handles[0])
        browser.get(thisurl)
    while not isReady(browser):
        time.sleep(1)

    browser.switch_to_frame("mainFrame")

    # Switch the date selector to its "from" option and fill in both dates.
    dateoptions = browser.find_elements_by_xpath('//*[@id="dateSelector1"]')[0]
    select = ui.Select(dateoptions)
    select.select_by_value("from")

    startbox = browser.find_elements_by_xpath('//*[@id="fromDate1"]')[0]
    startbox.send_keys(d.strftime("%m/%d/%Y"))
    endbox = browser.find_elements_by_xpath('//*[@id="toDate1"]')[0]
    endbox.send_keys((d+delta).strftime("%m/%d/%Y"))

    # Type the search terms and submit with RETURN.
    searchbox = browser.find_elements_by_xpath('.//input[@id="terms"]')[0]
    searchbox.send_keys(terms + Keys.RETURN)
    while not isReady(browser):
        time.sleep(1)

    # Keep retrying until the second frame inside the second top-level frame
    # can be entered (CSS hint from the original: fs_main > frame:nth-child(2)).
    # On success the browser is left focused on that inner frame.
    loaded = False
    while not loaded:
        try:
            browser.switch_to_default_content()
            browser.switch_to_frame(1)
            browser.switch_to_frame(1)
            loaded = True
        except Exception:
            time.sleep(1)
            browser.switch_to_default_content()

    # how many documents are there?
    # The count is whatever follows "of" in the pagination label; thousands
    # separators, if the site uses them, are stripped before converting.
    counttext = browser.find_elements_by_xpath('//span[@class="l3b paginationalign"]')[0].text.split("of")[1]
    nresults = int(counttext.replace(",", ""))
    lastresult = 0
    dcomplete = False

    # One pass per download; result sets above 500 are fetched in ranges of
    # at most 500 documents each.
    while not dcomplete:
        browser.switch_to_default_content()
        browser.switch_to_frame(1)
        # and the second frame within that frame
        browser.switch_to_frame(1)

        browser.find_elements_by_xpath('.//img[@title="Download Documents"]')[0].click()
        # The download dialog is expected to be the second window handle.
        browser.switch_to_window(browser.window_handles[1])
        while not isReady(browser):
            time.sleep(1)

        # then, it pops up
        downloadoptions = browser.find_elements_by_xpath('//*[@id="delFmt"]')[0]
        select = ui.Select(downloadoptions)
        select.select_by_value("QDS_EF_GENERICTYPE")

        if nresults>500:
            print("More than 500 results: {0}".format(nresults))
            rangebox = browser.find_elements_by_xpath('.//input[@id="rangetextbox"]')[0]
            nextresult = min(lastresult+500,nresults)
            rangebox.send_keys("{0}-{1}".format(lastresult+1,nextresult))
            print("clicking rangebox...")
            rangebox.click()
            time.sleep(1)
            lastresult = nextresult
            if lastresult==nresults:
                dcomplete = True
        else:
            dcomplete = True

        browser.find_elements_by_xpath('//img[@title="Download"]')[0].click()

        # then, it takes a while to load the download link
        while not isReady(browser):
            time.sleep(1)

        while not browser.find_elements_by_xpath('//center[@class="suspendbox"]'):
            time.sleep(1)

        # we set up to download automatically, otherwise, it won't work.
        browser.find_elements_by_xpath('//center[@class="suspendbox"]//a')[0].click()

        # if it's our first time, wait 10 seconds
        if firsttime:
            time.sleep(10)
            firsttime = False

        # wait a second to ensure the download is going
        time.sleep(1)

        # close the window, retrying once a second until the click succeeds
        closed = False
        while not closed:
            try:
                print("trying to close...")
                browser.find_elements_by_xpath('.//img[@title="Close Window"]')[0].click()
                closed = True
            except Exception:
                print("couldn't close...sleeping")
                time.sleep(1)
        browser.switch_to_window(browser.window_handles[0])
    d = d+delta
	# To run this script, from the command line type:
	# python scholar_selenium.py
	# The results will be stored in a file called results.csv

	# Please make sure you have installed the following:
	# * Python 2.6 or higher (not Python 3)
	# * Latest version of Firefox
	# * Selenium for Python [from the command line, type pip install selenium]

	# If you don't have pip installed, you need to install it:
	# Windows: http://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows
	# Mac: You should be able to type easy_install pip from the command line


	# You must also set firefox to automatically download files:
	# http://kb.mozillazine.org/File_types_and_download_actions

	# Still to do: write checks for what happens when you have too many results or too few, and adjust the delta time accordingly
	# Add in the other options for searching


	from selenium import webdriver
	from selenium.webdriver.common.keys import Keys
	import selenium.webdriver.support.ui as ui
	import selenium
	import datetime
	import time
	import re
	import csv

	def isReady(browser):
		"""Return True when the page currently loaded in `browser` reports
		a document.readyState of "complete", and False otherwise.

		`browser` is any object with an execute_script(script) method.
		"""
		return browser.execute_script("return document.readyState")=="complete"

	## Add in what you need here:
	# Search string typed into the search box; the inner double quotes are part
	# of the text that is sent.
	terms = '"sars"'

	## Date to begin
	# year,m,d
	start_date = datetime.date(2003,1,1)
	## Date to end
	# year,m,d
	end_date = datetime.date(2013,1,1)

	# One Firefox session is reused for every search.
	browser = webdriver.Firefox()
	# NOTE(review): `wait` is never used below; all waiting is done by polling.
	wait = ui.WebDriverWait(browser,10)

	# True until the first download link has been clicked (an extra pause is
	# taken after that first click).
	firsttime = True

	# Run one search per step of `delta`, from start_date through end_date.
	# NOTE(review): the end date sent is d + delta and the loop also advances by
	# delta, so if the site treats both bounds as inclusive, each day is searched
	# twice -- confirm against the site before changing.
	d = start_date
	delta = datetime.timedelta(days=1)
	while d <= end_date:
		thisurl = 'http://www.lexisnexis.com.ezp-prod1.hul.harvard.edu/hottopics/lnacademic/?shr=t&sfi=AC01NBSimplSrch'
		try:
			browser.get(thisurl)
		except Exception:
			# Loading failed; refocus the first window and retry once.
			# `except Exception` (not a bare except) lets Ctrl-C stop the script.
			browser.switch_to_window(browser.window_handles[0])
			browser.get(thisurl)
		while not isReady(browser):
			time.sleep(1)

		browser.switch_to_frame("mainFrame")

		# Switch the date selector to its "from" option and fill in both dates.
		dateoptions = browser.find_elements_by_xpath('//*[@id="dateSelector1"]')[0]
		select = ui.Select(dateoptions)
		select.select_by_value("from")

		startbox = browser.find_elements_by_xpath('//*[@id="fromDate1"]')[0]
		startbox.send_keys(d.strftime("%m/%d/%Y"))
		endbox = browser.find_elements_by_xpath('//*[@id="toDate1"]')[0]
		endbox.send_keys((d+delta).strftime("%m/%d/%Y"))

		# Type the search terms and submit with RETURN.
		searchbox = browser.find_elements_by_xpath('.//input[@id="terms"]')[0]
		searchbox.send_keys(terms + Keys.RETURN)
		while not isReady(browser):
			time.sleep(1)

		# Keep retrying until the second frame inside the second top-level frame
		# can be entered (CSS hint from the original: fs_main > frame:nth-child(2)).
		# On success the browser is left focused on that inner frame.
		loaded = False
		while not loaded:
			try:
				browser.switch_to_default_content()
				browser.switch_to_frame(1)
				browser.switch_to_frame(1)
				loaded = True
			except Exception:
				time.sleep(1)
				browser.switch_to_default_content()

		# how many documents are there?
		# The count is whatever follows "of" in the pagination label; thousands
		# separators, if the site uses them, are stripped before converting.
		counttext = browser.find_elements_by_xpath('//span[@class="l3b paginationalign"]')[0].text.split("of")[1]
		nresults = int(counttext.replace(",", ""))
		lastresult = 0
		dcomplete = False

		# One pass per download; result sets above 500 are fetched in ranges of
		# at most 500 documents each.
		while not dcomplete:
			browser.switch_to_default_content()
			browser.switch_to_frame(1)
			# and the second frame within that frame
			browser.switch_to_frame(1)

			browser.find_elements_by_xpath('.//img[@title="Download Documents"]')[0].click()
			# The download dialog is expected to be the second window handle.
			browser.switch_to_window(browser.window_handles[1])
			while not isReady(browser):
				time.sleep(1)

			# then, it pops up
			downloadoptions = browser.find_elements_by_xpath('//*[@id="delFmt"]')[0]
			select = ui.Select(downloadoptions)
			select.select_by_value("QDS_EF_GENERICTYPE")

			if nresults>500:
				print("More than 500 results: {0}".format(nresults))
				rangebox = browser.find_elements_by_xpath('.//input[@id="rangetextbox"]')[0]
				nextresult = min(lastresult+500,nresults)
				rangebox.send_keys("{0}-{1}".format(lastresult+1,nextresult))
				print("clicking rangebox...")
				rangebox.click()
				time.sleep(1)
				lastresult = nextresult
				if lastresult==nresults:
					dcomplete = True
			else:
				dcomplete = True

			browser.find_elements_by_xpath('//img[@title="Download"]')[0].click()

			# then, it takes a while to load the download link
			while not isReady(browser):
				time.sleep(1)

			while not browser.find_elements_by_xpath('//center[@class="suspendbox"]'):
				time.sleep(1)

			# we set up to download automatically, otherwise, it won't work.
			browser.find_elements_by_xpath('//center[@class="suspendbox"]//a')[0].click()

			# if it's our first time, wait 10 seconds
			if firsttime:
				time.sleep(10)
				firsttime = False

			# wait a second to ensure the download is going
			time.sleep(1)

			# close the window, retrying once a second until the click succeeds
			closed = False
			while not closed:
				try:
					print("trying to close...")
					browser.find_elements_by_xpath('.//img[@title="Close Window"]')[0].click()
					closed = True
				except Exception:
					print("couldn't close...sleeping")
					time.sleep(1)
			browser.switch_to_window(browser.window_handles[0])
		d = d+delta