Skip to content

Instantly share code, notes, and snippets.

@ibraizQazi
Last active January 30, 2018 12:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ibraizQazi/bfbe0f944ed700ddc430b97f99445568 to your computer and use it in GitHub Desktop.
Extract Dosage table of every medicine on site and save it to a csv file.
import re
import sys
import time
import string
import csv
import urlparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
class wait_for_page_load(object):
    """Context manager that blocks on exit until the browser has navigated
    to a new page.

    On entry it remembers the current ``<html>`` element; on exit it polls
    until the browser reports a *different* ``<html>`` element (i.e. the
    page was replaced), or raises after ``timeout`` seconds.
    """

    def __init__(self, browser, timeout=3):
        # timeout: seconds to wait for the new page (default matches the
        # original hard-coded 3-second limit, so existing callers are
        # unaffected).
        self.browser = browser
        self.timeout = timeout

    def __enter__(self):
        # Snapshot the current <html> element so page replacement can be
        # detected by identity change.
        self.old_page = self.browser.find_element_by_tag_name('html')
        return self

    def __exit__(self, *_):
        self.wait_for(self.page_has_loaded)

    def wait_for(self, condition_function):
        """Poll ``condition_function`` every 0.1 s until it returns truthy.

        Returns True on success; raises Exception if ``self.timeout``
        seconds elapse first.
        """
        start_time = time.time()
        while time.time() < start_time + self.timeout:
            if condition_function():
                return True
            time.sleep(0.1)
        raise Exception(
            'Timeout waiting for {}'.format(condition_function.__name__)
        )

    def page_has_loaded(self):
        # A navigation replaces the DOM, so the <html> element's remote id
        # changes even when the URL does not.
        new_page = self.browser.find_element_by_tag_name('html')
        return new_page.id != self.old_page.id
def init_driver():
    """Start a Firefox WebDriver and attach a 5-second explicit wait.

    The wait object is stored on the driver as ``driver.wait`` so callers
    can reuse it without constructing their own.
    """
    browser = webdriver.Firefox()
    browser.wait = WebDriverWait(browser, 5)
    return browser
def save_data_csv(results, filename='drugsinfo.csv'):
    """Write rows of scraped table data to a CSV file (overwriting it).

    Args:
        results: iterable of rows, each row an iterable of string cells.
        filename: destination path; defaults to the original hard-coded
            'drugsinfo.csv' so existing callers are unchanged.
    """
    # 'w' with newline='' is the documented csv idiom on Python 3; the
    # original opened the file in 'wb', which raises TypeError with the
    # text-mode csv writer on Python 3.
    with open(filename, 'w', newline='') as f:
        csv.writer(f).writerows(results)
def parse_rows(rows):
    """Collect the text of nested <tr> and <td> cells from table rows.

    Args:
        rows: iterable of BeautifulSoup row elements (each supporting
            ``find_all`` and containing elements with ``get_text``).

    Returns:
        A list of lists of cell-text strings; rows with no matching
        children contribute nothing.
    """
    results = []
    for row in rows:
        # NOTE: the original reused the name ``row`` as the comprehension
        # variable below; under Python 2 comprehension variables leak, so
        # the subsequent find_all('td') ran on the last nested <tr> instead
        # of the current row. Distinct names fix that.
        nested_rows = row.find_all('tr')
        if nested_rows:
            results.append([nested.get_text() for nested in nested_rows])
        cells = row.find_all('td')
        if cells:
            results.append([cell.get_text() for cell in cells])
    return results
def scrape(driver):
    """Scrape the dosage table from each drug page on druginfosys.com.

    Starts from a fixed drug page and repeatedly clicks the "next" link
    (up to 3999 times), extracting each page's table and saving the
    accumulated rows to drugsinfo.csv after every page.
    """
    url = "http://www.druginfosys.com/drug.aspx?drugcode=22&type=1"
    driver.get(url)
    count = 0
    all_rows = []
    while count < 3999:
        # Re-parse the page source on every iteration: the original parsed
        # it once before the loop, so every pass scraped the first page.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        medicine_name = soup.find('h1')
        # Guard against pages with no <h1>/<table>; the original crashed
        # on len(None) when find() returned nothing.
        if medicine_name is not None and len(medicine_name) > 0:
            table = soup.find('table')
            if table is not None:
                table_data = parse_rows(table.find_all('tr'))
                for entry in table_data:
                    print('\t'.join(entry))
                # save_data_csv truncates the file, so save the cumulative
                # rows; the original overwrote the CSV with only the
                # current page each time.
                all_rows.extend(table_data)
                save_data_csv(all_rows)
        time.sleep(3)
        with wait_for_page_load(driver):
            driver.find_element_by_xpath('/html/body/div[2]/div/form/div[4]/div/article/center/a[2]').click()
        count += 1
    driver.close()
def _main():
    """Run the scraper, always shutting the browser down afterwards."""
    driver = init_driver()
    try:
        scrape(driver)
        time.sleep(5)
    finally:
        # The original leaked the browser process if scrape() raised;
        # quit() must run on every exit path.
        driver.quit()


if __name__ == '__main__':
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment