Example of parsing/scraping data (through a proxy) for all doctors using BeautifulSoup. Source used here: https://old.mciindia.org [NOTE: please throttle the number of threads as per your proxy's restrictions]
import requests
from bs4 import BeautifulSoup
import sys
import json
import threading
import re
import time
import os
proxies = {
    'http': '<give http proxy here>',
    'https': '<give https proxy here>'
}
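
# Optional sanity check (not part of the original gist): a minimal sketch to
# confirm the proxy settings before launching threads. httpbin.org/ip simply
# echoes the caller's egress IP; any similar endpoint works.
def check_proxy():
    resp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
    print("Egress IP as seen through the proxy: {}".format(resp.text.strip()))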

class writeDocDetailThread(threading.Thread):
    """Fetches and logs the detail page for a single doctor ID."""

    def __init__(self, index, limit):
        super(writeDocDetailThread, self).__init__()
        self.index = index
        self.limit = limit

    def run(self):
        try:
            log_doc_details(self.index, self.limit)
        except Exception:
            print("Unexpected error : {:s} at {:s}.".format(
                str(sys.exc_info()[0].__name__), str(sys.exc_info()[2].tb_lineno)))

class doctorsListThread(threading.Thread):
    """Works through one batch of 1000 consecutive doctor IDs."""

    def __init__(self, startingIndex):
        super(doctorsListThread, self).__init__()
        self.startingIndex = startingIndex

    def run(self):
        try:
            get_doctors_detail(self.startingIndex)
        except Exception:
            print("Unexpected error : {:s} at {:s}.".format(
                str(sys.exc_info()[0].__name__), str(sys.exc_info()[2].tb_lineno)))

def get_doctors_detail(startingIndex):
    # Batch N covers IDs (N-1)*1000 + 1 through N*1000, so batches don't
    # overlap (the original bounds re-fetched the ID at each batch boundary).
    startingLimit = (startingIndex - 1) * 1000 + 1
    upperLimit = startingIndex * 1000
    try:
        for i in range(startingLimit, upperLimit + 1):
            writeDocDetail = writeDocDetailThread(i, upperLimit)
            # run() (not start()) executes synchronously, so each batch thread
            # fetches its IDs one at a time instead of spawning 1000 more threads.
            writeDocDetail.run()
            print("Logging: {} , Upper: {}".format(i, upperLimit))
    except Exception:
        print("Unexpected error : {:s} at {:s}.".format(
            str(sys.exc_info()[0].__name__), str(sys.exc_info()[2].tb_lineno)))

def log_doc_details(index, limit):
    url_to_scrape = 'https://old.mciindia.org/ViewDetails.aspx?ID={}'.format(index)
    r = requests.get(url_to_scrape, proxies=proxies)
    soup = BeautifulSoup(r.text, "html.parser")
    doc_detailed_data = {}
    try:
        doc_detailed_data['Name'] = soup.find("span", {"id": "Name"}).text
        doc_detailed_data['FatherName'] = soup.find("span", {"id": "FatherName"}).text
        doc_detailed_data['YearOfInfo'] = soup.find("span", {"id": "lbl_Info"}).text
        doc_detailed_data['RegistrationNo'] = soup.find("span", {"id": "Regis_no"}).text
        doc_detailed_data['DateOfReg'] = soup.find("span", {"id": "Date_Reg"}).text
        doc_detailed_data['StateMedicalCouncil'] = soup.find("span", {"id": "Lbl_Council"}).text
        doc_detailed_data['Qualification'] = soup.find("span", {"id": "Qual"}).text
        doc_detailed_data['QualificationYear'] = soup.find("span", {"id": "QualYear"}).text
        doc_detailed_data['University'] = soup.find("span", {"id": "Univ"}).text
        doc_detailed_data['Address'] = soup.find("span", {"id": "Address"}).text
    except Exception:
        # Missing IDs render an empty page; keep whatever fields were found.
        pass
    fileName = "doc_details_{}".format(limit)
    with open('/home/ubuntu/doctors_data/raw_files/{}.txt'.format(fileName), 'a') as f:
        # One JSON object per line (JSON Lines); the original appended objects
        # back-to-back with no separator, which is not parseable afterwards.
        f.write(json.dumps(doc_detailed_data) + '\n')
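
# Example helper (not part of the original gist): a minimal sketch for reading
# the scraped records back, assuming one JSON object per line as written above.
def load_records(path):
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]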

if __name__ == "__main__":
    count = 0
    for i in range(1, 1001):  # 1000 batches x 1000 IDs = 1,000,000 pages
        count += 1
        print("Iteration: {} of 1000 (1000 entries each)".format(count))
        if count % 50 == 0:
            time.sleep(10)  # pause every 50 batches to respect proxy limits
        doctorsListFetch = doctorsListThread(i)
        doctorsListFetch.start()