Skip to content

Instantly share code, notes, and snippets.

@nitinbhojwani
Created October 7, 2015 19:07
Show Gist options
  • Save nitinbhojwani/e9d972a2996bbdf18d55 to your computer and use it in GitHub Desktop.
Save nitinbhojwani/e9d972a2996bbdf18d55 to your computer and use it in GitHub Desktop.
I used this Python script to scrape data from drugbank.ca.
from lxml import html
import requests
import csv
import sys
def _append_csv_row(path, row):
    """Append ``row`` (a dict) to the CSV file at ``path`` as a single line.

    A UTF-8 byte-order mark is written only when the file is new or empty, so
    it appears once at the top of the file instead of before every row.
    No header line is written; columns follow the dict's iteration order.
    """
    import os  # local import so this block does not depend on file-level edits

    try:
        needs_bom = os.path.getsize(path) == 0
    except OSError:
        # The file does not exist yet (or cannot be stat'ed); open() below
        # either creates it or raises the real error.
        needs_bom = True

    if sys.version_info[0] == 2:
        # Python 2's csv module writes byte strings to a binary-mode file.
        row = dict((key, value.encode('utf-8')) for key, value in row.items())
        output_file = open(path, 'ab')
        bom = u'\ufeff'.encode('utf-8')
    else:
        # Python 3's csv module writes text; the file object does the
        # encoding, and newline='' stops it from translating line endings.
        output_file = open(path, 'a', encoding='utf-8', newline='')
        bom = u'\ufeff'

    # The handle is closed even if one of the writes fails.
    with output_file:
        if needs_bom:
            output_file.write(bom)
        csv.DictWriter(output_file, list(row)).writerow(row)


def fetchResult(z):
    """Scrape one drugbank.ca drug page and append it to the output CSV.

    The page for accession number ``DB`` + ``z`` zero-padded to five digits is
    downloaded, and the text of the ``th``/``td`` cells in the first table
    under ``<main>`` (only rows that carry neither an ``id`` nor a ``class``
    attribute) is collected into one record together with the accession
    number under the key ``'id'``.  The record is appended as a single CSV
    row to the file named by the module-level global ``fileName``, which must
    be set before this function is called.

    Args:
        z: Drug number (an int); it is converted with ``str`` and padded.

    Returns:
        None.  Progress messages are printed to stdout.

    Raises:
        Whatever ``requests.get`` raises on network failure or timeout, and
        ``IOError``/``OSError`` if the output file cannot be written.
    """
    print("Starting ... "+str(z))

    drug_id = 'DB' + str(z).rjust(5, '0')
    # A timeout keeps one unresponsive request from hanging the whole run.
    page = requests.get('http://www.drugbank.ca/drugs/' + drug_id, timeout=30)
    tree = html.fromstring(page.text)

    rows = '//html/body/main/table[1]/tbody/tr[not(@id) and not(@class)]'
    titles = [th.text_content() for th in tree.xpath(rows + '/th')]
    details = [td.text_content() for td in tree.xpath(rows + '/td')]

    record = {'id': drug_id}
    # Headers and cells are paired by position; zip() stops at the shorter
    # list instead of raising IndexError when a page has more headers than
    # cells.
    # NOTE(review): this assumes each selected row holds exactly one th and
    # one td -- confirm against the live page markup.
    record.update(zip(titles, details))

    # NOTE(review): no header is written and the set of titles may differ
    # from page to page, so columns in the output are not guaranteed to line
    # up across rows -- confirm whether downstream consumers rely on that.
    _append_csv_row(fileName, record)
    print(str(z) + 'Done ...')
# Number of drug pages that go into one output file.
BATCH_SIZE = 1000

# Exclusive upper bound on the drug numbers that are ever requested.
# NOTE(review): presumably one past the highest DrugBank accession number at
# the time the script was written -- confirm before reusing.
DRUG_NUMBER_LIMIT = 9024


def batch_end(start):
    """Return the exclusive end of the batch that begins at ``start``.

    This is one past the smallest multiple of ``BATCH_SIZE`` that is strictly
    greater than ``start``, capped at ``DRUG_NUMBER_LIMIT``.  Floor division
    keeps the result an int on Python 3 as well as Python 2, so it can be
    passed straight to ``range``.
    """
    return min((start // BATCH_SIZE + 1) * BATCH_SIZE + 1, DRUG_NUMBER_LIMIT)


def output_file_name(start):
    """Return the CSV file name used for the batch that begins at ``start``.

    Starts up to and including ``BATCH_SIZE`` go to ``results.csv``; later
    batches go to ``results<k>.csv`` with ``k = start // BATCH_SIZE + 1``.
    """
    if start <= BATCH_SIZE:
        return 'results.csv'
    return 'results' + str(start // BATCH_SIZE + 1) + '.csv'


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: python ' + sys.argv[0] + ' START')

    start = int(sys.argv[1])
    end = batch_end(start)
    print(start, end)

    # fetchResult() reads this module-level name to decide where to append.
    fileName = output_file_name(start)

    for z in range(start, end):
        fetchResult(z)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment