# nitinbhojwani/DrugbankScrapper.py

## DrugbankScrapper.py
from lxml import html
import requests
import csv
import sys


def fetchResult(z):
	"""Scrape one DrugBank drug page and append its fields as one CSV row.

	Downloads ``http://www.drugbank.ca/drugs/DB<z left-padded to 5 digits>``,
	reads the first table under ``<main>`` and, for every row that has
	neither an ``id`` nor a ``class`` attribute, stores the text of its
	``<td>`` under the text of its ``<th>``.  The collected values, plus the
	drug id, are appended as a single row to the file named by the
	module-level ``fileName``.

	No header row is written and the column order is simply the key order
	of the per-drug dict.  The HTTP status is not checked, so a page with no
	matching rows still produces a row that holds only the drug id.

	Works on Python 2 (the original target) and Python 3.

	:param z: drug number, e.g. ``28`` for ``DB00028``.
	:returns: ``None``; the result is the appended CSV row and two progress
		lines on stdout.
	"""
	print("Starting ... "+str(z))
	drug_id = 'DB' + str(z).rjust(5, '0')
	page = requests.get('http://www.drugbank.ca/drugs/' + drug_id)
	tree = html.fromstring(page.text)

	is_py3 = sys.version_info[0] >= 3

	# The key is the string 'id'; the bare name `id` is the builtin function.
	record = {'id': drug_id}
	# Pair header and value per row: two flat th/td lists drift out of step
	# (or run out of values) as soon as one row lacks either cell.
	for row in tree.xpath('//html/body/main/table[1]/tbody/tr[not(@id) and not(@class)]'):
		headers = row.xpath('th')
		values = row.xpath('td')
		if not headers or not values:
			continue
		text = values[0].text_content()
		# Python 2's csv module wants byte strings, Python 3's wants text.
		record[headers[0].text_content()] = text if is_py3 else text.encode("utf-8")

	if is_py3:
		# csv on Python 3 needs a text-mode file opened with newline=''.
		output_file = open(fileName, 'a+', encoding='utf-8', newline='')
		bom = u'\ufeff'
	else:
		output_file = open(fileName, 'a+')
		bom = u'\ufeff'.encode('utf8')
	with output_file:
		# A BOM is only meaningful as the very first bytes of the file, so
		# write it when the file is still empty, not before every row.
		output_file.seek(0, 2)
		if output_file.tell() == 0:
			output_file.write(bom)
		csv.DictWriter(output_file, list(record)).writerows([record])
	print(str(z) + 'Done ...')

# Command line: DrugbankScrapper.py <start>
# Scrapes drug numbers from <start> up to the end of the 1000-wide chunk that
# contains it.  `end` is exclusive and is capped at 9024.
if len(sys.argv) < 2:
	sys.exit('usage: ' + sys.argv[0] + ' <first drug number>')

start = int(sys.argv[1])
# Floor division: with "/" this is a float on Python 3, which makes range()
# below raise TypeError and puts "2.0" into the file name.
chunk = start // 1000 + 1
end = min(chunk * 1000 + 1, 9024)

print(start, end)

# fetchResult() reads this module-level name to know where to append.
# Starts up to and including 1000 share the un-numbered file.
if start <= 1000:
	fileName = 'results.csv'
else:
	fileName = 'results' + str(chunk) + '.csv'

for z in range(start, end):
	fetchResult(z)
	# NOTE: a second copy of this whole script used to follow here with its
	# indentation flattened to a single tab.  It was not valid Python, it sat
	# inside the loop above, and it defined nothing that is not already
	# defined earlier in this file, so it has been dropped.