Created
September 21, 2017 08:04
-
-
Save shafayeatsumit/d3d982265c16af63810e4ad522b1a3ec to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
import time | |
import re | |
import math | |
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | |
# Landing page of the enquiry site; every navigation in this script starts here.
root_url = "https://eservice.hsa.gov.sg/prism/ct_r/enquiry.do?action=getAllTherapeuticArea"

# Drive Chrome through the Selenium server listening on 127.0.0.1:4444.
driver = webdriver.Remote(
    command_executor='http://127.0.0.1:4444/wd/hub',
    desired_capabilities=DesiredCapabilities.CHROME,
)
#driver = webdriver.Firefox()

# Open the landing page once so the first link lookup has something to search.
driver.get(root_url)
def find_number_of_items(driver):
    """Visit every category link on the root page and read its count.

    For each link matched on the currently loaded page, the link is
    clicked, the text of the first pager cell on the resulting page is
    read, and the first run of digits in that text is parsed as an
    integer. The browser is then sent back to ``root_url`` before the
    next link is handled.

    Args:
        driver: WebDriver-like object exposing ``find_elements_by_xpath``,
            ``find_element_by_xpath`` and ``get``. It is expected to
            already be showing the root page.

    Returns:
        list of int: one parsed number per category link, in link order.
        Presumably the number of records in that category -- confirm
        against the site.

    Raises:
        ValueError: if the pager cell of a category contains no digits.
    """
    category_links_xpath = "//table[@class='fmTbl']/tbody/tr/td/a"
    count_cell_xpath = '//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[1]'

    item_number_list = []
    category_total = len(driver.find_elements_by_xpath(category_links_xpath))
    print ("page count%s"%category_total)
    for position in range(category_total):
        # The page is reloaded at the end of every pass, so the links have
        # to be located again instead of reusing the earlier lookup.
        links = driver.find_elements_by_xpath(category_links_xpath)
        links[position].click()
        time.sleep(10)
        print ("sleeping")
        label = driver.find_element_by_xpath(count_cell_xpath).text
        found = re.search(r'\d+', label)
        if found is None:
            # Fail with context rather than an AttributeError on None.
            raise ValueError(
                "no number in pager text %r for category %d" % (label, position)
            )
        driver.get(root_url)
        item_number_list.append(int(found.group()))
        # Running progress; the last line printed is the complete list.
        print (item_number_list)
    return item_number_list
# Number of rows to visit in each category, in the order the category links
# appear on the root page. Hard-coded from an earlier run; uncomment the call
# below to recompute it.
#lis = find_number_of_items(driver)
lis = [7, 25, 13, 3, 17, 2, 8, 17, 12, 17, 4, 38, 24, 7, 264, 16, 4, 0, 5, 5, 10, 3, 5, 9, 4]

# The same link expression is used on the root page (category links) and on
# a category's listing pages (row links).
LINKS_XPATH = "//table[@class='fmTbl']/tbody/tr/td/a"
NEXT_LINK_XPATH = "//*[@id='page']/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[2]/a[contains(text(), '[next]')]"
# The pagination arithmetic divides and takes the remainder by 9, so nine
# row links per listing page is assumed -- TODO confirm against the live site.
ROWS_PER_PAGE = 9
# Fixed pause, in seconds, after every click that loads a new page.
PAGE_LOAD_WAIT = 10


def _open_category(category_index):
    """Reload the root page and click the category link at *category_index*."""
    driver.get(root_url)
    driver.find_elements_by_xpath(LINKS_XPATH)[category_index].click()
    time.sleep(PAGE_LOAD_WAIT)


# enumerate() supplies the true position of each category; looking the
# position up with lis.index(val) returns the first match and therefore
# picks the wrong category whenever two categories share a count.
for indx, val in enumerate(lis):
    for row in range(val):
        print ("row number",row)
        # Start every row from the root page so the browser state never
        # depends on how the previous row (or an empty category) ended.
        _open_category(indx)

        # Split the absolute row number into "pages to skip" and "position
        # on that page" without overwriting the loop variable.
        pages_to_skip, row_on_page = divmod(row, ROWS_PER_PAGE)
        if pages_to_skip:
            #this would handle the pagination
            print ("inside if")
            print ("nxt page count",pages_to_skip)
            for _ in range(pages_to_skip):
                driver.find_element_by_xpath(NEXT_LINK_XPATH).click()
                time.sleep(PAGE_LOAD_WAIT)
        else:
            print ("inside else")

        driver.find_elements_by_xpath(LINKS_XPATH)[row_on_page].click()
        time.sleep(PAGE_LOAD_WAIT)
        #extract data here
        print ("scraped amount",row)

# Leave the browser on the root page once everything has been visited.
driver.get(root_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment