Created
October 7, 2015 19:07
-
-
Save nitinbhojwani/e9d972a2996bbdf18d55 to your computer and use it in GitHub Desktop.
I used this Python script to scrape data from drugbank.ca.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
import requests | |
import csv | |
import sys | |
def fetchResult(z):
    """Scrape one DrugBank drug page and append it as a row to the CSV file.

    The page for drug number ``z`` (zero-padded to five digits, e.g.
    ``DB00042``) is downloaded, the header (``th``) and value (``td``) cells
    of the first table under ``<main>`` are paired up positionally, and the
    resulting record is appended to the file named by the module-level
    ``fileName``.

    Args:
        z: Numeric part of the DrugBank accession number.

    Note:
        Cell values are written as UTF-8 encoded byte strings, which is what
        the Python 2 ``csv`` module expects.  No header row is written, and
        each row's columns follow the field order of that drug's page (with
        the accession number first), so drugs exposing different fields do
        not share a column layout.
    """
    print("Starting ... "+str(z))

    drug_id = 'DB' + str(z).zfill(5)
    page = requests.get('http://www.drugbank.ca/drugs/' + drug_id)
    tree = html.fromstring(page.text)

    # Only plain data rows: rows carrying an id or class attribute are skipped.
    row_xpath = '//html/body/main/table[1]/tbody/tr[not(@id) and not(@class)]'
    titles = [th.text_content() for th in tree.xpath(row_xpath + '/th')]
    detail = [td.text_content() for td in tree.xpath(row_xpath + '/td')]

    # Track the column order explicitly so the accession number always comes
    # first and the remaining fields keep their on-page order, instead of
    # relying on dict iteration order.
    record = {'id': drug_id}
    columns = ['id']
    # zip() stops at the shorter list, so a page with more headers than
    # values no longer aborts the whole run with an IndexError.
    for title, value in zip(titles, detail):
        if title not in record:
            columns.append(title)
        record[title] = value.encode("utf-8")

    with open(fileName, 'a+') as output_file:
        # A BOM is only valid at the very start of a file.  One row is
        # appended per call, so emit it only while the file is still empty.
        output_file.seek(0, 2)
        if output_file.tell() == 0:
            output_file.write(u'\ufeff'.encode('utf8'))
        csv.DictWriter(output_file, columns).writerow(record)

    print(str(z) + 'Done ...')
# Command-line driver: scrape a batch of drug numbers beginning at START (the
# first command-line argument) and ending at the next multiple of 1000.
if len(sys.argv) < 2:
    sys.exit('usage: ' + sys.argv[0] + ' START')
start = int(sys.argv[1])

# Floor division keeps the batch index an integer on every Python version;
# true division would hand range() a float and produce file names such as
# 'results2.001.csv'.
batch = start // 1000 + 1

# The batch runs through the next multiple of 1000 (range() excludes `end`),
# but never past the hard-coded bound of 9024.
# NOTE(review): 9024 is presumably the highest drug number available when the
# script was written -- confirm before reusing.
end = min(batch * 1000 + 1, 9024)
print(start, end)

# fetchResult() appends to this module-level file name; the first batch goes
# to 'results.csv', later batches to 'results<batch>.csv'.
if start <= 1000:
    fileName = 'results.csv'
else:
    fileName = 'results' + str(batch) + '.csv'

for z in range(start, end):
    fetchResult(z)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment