Skip to content

Instantly share code, notes, and snippets.

@jaklinger
Last active March 28, 2018 12:36
Show Gist options
  • Save jaklinger/cf395a1e06228470bcef28f56de9ca48 to your computer and use it in GitHub Desktop.
Save jaklinger/cf395a1e06228470bcef28f56de9ca48 to your computer and use it in GitHub Desktop.
Example of scraping content from a JS-redirected iframe
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import os
import time
def wait_and_find(driver, element_id, load_time):
    """Wait for the element with the given ID to appear and return it.

    Args:
        driver: The Selenium WebDriver to search in (current frame context).
        element_id: Value of the HTML ``id`` attribute to look for.
        load_time: Maximum number of seconds to wait for the element. A
            third of this time is always slept up front.

    Returns:
        The located WebElement.

    Raises:
        selenium.common.exceptions.TimeoutException: raised by
            ``WebDriverWait.until`` if the element is not present within
            ``load_time`` seconds.
    """
    # Unconditional pause before polling, giving the page's JavaScript a
    # head start before the DOM is inspected.
    time.sleep(load_time / 3)
    locator = (By.ID, element_id)
    condition = expected_conditions.presence_of_element_located(locator)
    # ``until`` returns whatever the condition returns -- here the located
    # element -- so no second lookup (and no deprecated
    # ``find_element_by_id`` call) is needed.
    return WebDriverWait(driver, load_time).until(condition)
def get_grant_info_div(driver, url, iframe_id,
                       tab_to_click, div_id,
                       keyword, load_time=1.5):
    """Load a page, enter its iframe, click a tab and return a div's HTML.

    Args:
        driver: The Selenium WebDriver used to load the page.
        url: Address of the page to open.
        iframe_id: ID of the iframe that holds the content.
        tab_to_click: ID of the element inside the iframe to click.
        div_id: ID of the div (inside the iframe) whose HTML is wanted.
        keyword: Text that must occur in the div's HTML for the result to
            be accepted.
        load_time: Seconds passed to ``wait_and_find`` for each lookup.

    Returns:
        The ``innerHTML`` of the requested div.

    Raises:
        AssertionError: if ``keyword`` does not occur in the div's HTML.
        Any exception raised by ``wait_and_find`` or the driver propagates
        unchanged.
    """
    driver.get(url)
    iframe = wait_and_find(driver, iframe_id, load_time)
    # ``switch_to.frame`` replaces the deprecated ``switch_to_frame``;
    # subsequent lookups are resolved inside the iframe.
    driver.switch_to.frame(iframe)
    wait_and_find(driver, tab_to_click, load_time).click()
    div = wait_and_find(driver, div_id, load_time)
    html = div.get_attribute("innerHTML")
    # Raise explicitly instead of using ``assert`` so the check is not
    # stripped when Python runs with -O.
    if keyword not in html:
        raise AssertionError("'{}' not found in iframe".format(keyword))
    return html
if __name__ == "__main__":
    # Prepend the working directory to PATH so that Selenium picks up a
    # local chromedriver. os.getcwd() is used instead of the PWD variable,
    # which is not set on every platform.
    os.environ["PATH"] = os.getcwd() + os.pathsep + os.environ["PATH"]
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")

    def new_driver():
        """Start a fresh headless Chrome session (chromedriver from PATH)."""
        return webdriver.Chrome(options=chrome_options)

    # IDs of the page elements to traverse and the text used to validate
    # the scraped HTML.
    iframe_id = "embeddedIframe"
    tab_to_click = "synopsisDetailsTab"
    div_id = "synopsisDetailsContent"
    keyword = "Funding Opportunity Number"
    max_load_time = 10
    min_load_time = 1
    urls = ["https://www.grants.gov/view-opportunity.html?oppId=39707"] * 10

    driver = new_driver()
    try:
        # Iterate through URLs and get the iframe source
        for url in urls:
            load_time = min_load_time
            html = None
            while html is None:
                try:
                    html = get_grant_info_div(driver, url, iframe_id,
                                              tab_to_click, div_id,
                                              keyword, load_time=load_time)
                    # Do whatever you want with the HTML source
                except Exception as err:
                    print("ERROR ({}): {!r}".format(url, err))
                    # Retry with a longer wait and a brand-new browser.
                    load_time += 1
                    driver.quit()
                    driver = None  # already closed; skip quit in finally
                    # Explicit raise (not ``assert``) so the retry cap
                    # still applies when Python runs with -O.
                    if load_time > max_load_time:
                        raise RuntimeError("Maximum retries exceeded")
                    driver = new_driver()
    finally:
        # Cleanly exit, even if an unexpected error escaped the loop
        if driver is not None:
            driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment