# kartikv11/doctors_scraping.py

## doctors_scraping.py
import requests
from bs4 import BeautifulSoup
import sys
import json
import threading
import re
import time
import os

# Proxy endpoints handed to requests.get(); both values are placeholders
# that must be replaced with real proxy URLs before running.
proxies = {
    'http': '<give http proxy here>',
    'https': '<give https proxy here>',
}

class writeDocDetailThread(threading.Thread):
    """Thread wrapper that scrapes and logs a single doctor record.

    index -- record ID handed to ``log_doc_details``.
    limit -- second argument handed to ``log_doc_details`` (the upper bound
             of the chunk the record belongs to).
    """

    def __init__(self, index, limit):
        super(writeDocDetailThread, self).__init__()
        self.index = index
        self.limit = limit

    def run(self):
        """Log the record; ordinary errors are printed instead of raised."""
        try:
            log_doc_details(self.index, self.limit)
        except Exception as exc:
            # Catch ``Exception`` rather than using a bare ``except`` so that
            # SystemExit / KeyboardInterrupt are not swallowed together with
            # genuine scraping failures.
            print("Unexpected error : {:s} at {:s}.".format(
                type(exc).__name__, str(sys.exc_info()[2].tb_lineno)))


class doctorsListThread(threading.Thread):
    """Thread that processes one chunk of records via ``get_doctors_detail``.

    startingIndex -- chunk number forwarded unchanged to
                     ``get_doctors_detail``.
    """

    def __init__(self, startingIndex):
        super(doctorsListThread, self).__init__()
        self.startingIndex = startingIndex

    def run(self):
        """Process the chunk; ordinary errors are printed instead of raised."""
        try:
            get_doctors_detail(self.startingIndex)
        except Exception as exc:
            # Catch ``Exception`` rather than using a bare ``except`` so that
            # SystemExit / KeyboardInterrupt are not swallowed together with
            # genuine scraping failures.
            print("Unexpected error : {:s} at {:s}.".format(
                type(exc).__name__, str(sys.exc_info()[2].tb_lineno)))


def get_doctors_detail(startingIndex):
    """Scrape one chunk of 1000 consecutive doctor records.

    startingIndex -- 1-based chunk number; chunk ``k`` covers record IDs
                     ``(k - 1) * 1000 + 1`` through ``k * 1000`` inclusive.

    Records are processed one after another in the calling thread. Errors
    are printed rather than raised.
    """
    upperLimit = startingIndex * 1000
    # Begin one past the previous chunk's upper bound. Starting at
    # (startingIndex - 1) * 1000 would re-fetch the previous chunk's last ID,
    # so every boundary record (1000, 2000, ...) would be scraped twice and
    # written into two different chunk files.
    startingLimit = upperLimit - 1000 + 1
    try:
        for i in range(startingLimit, upperLimit + 1):
            writeDocDetail = writeDocDetailThread(i, upperLimit)
            # run() is called directly (not start()), so the record is
            # processed synchronously and no extra thread is spawned.
            writeDocDetail.run()
            print("Logging: {} , Upper: {}".format(i, upperLimit))
    except Exception as exc:
        print("Unexpected error : {:s} at {:s}.".format(
            type(exc).__name__, str(sys.exc_info()[2].tb_lineno)))

def log_doc_details(index, limit, timeout=30):
    """Fetch one doctor's detail page and append its fields to a chunk file.

    index   -- record ID substituted into the ViewDetails URL.
    limit   -- number used to build the output file name
               ``doc_details_<limit>.txt``.
    timeout -- seconds passed to ``requests.get`` as its timeout
               (optional, defaults to 30).

    One JSON object is appended per call, containing every field whose
    ``<span>`` was found on the page. Request and file errors propagate to
    the caller.
    """
    url_to_scrape = 'https://old.mciindia.org/ViewDetails.aspx?ID={}'.format(index)
    # Without a timeout a stalled server would block this worker forever.
    r = requests.get(url_to_scrape, proxies=proxies, timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")

    # (output key, id attribute of the <span> that holds the value)
    span_ids = (
        ('Name', 'Name'),
        ('FatherName', 'FatherName'),
        ('YearOfInfo', 'lbl_Info'),
        ('RegistrationNo', 'Regis_no'),
        ('DateOfReg', 'Date_Reg'),
        ('StateMedicalCouncil', 'Lbl_Council'),
        ('Qualification', 'Qual'),
        ('QualificationYear', 'QualYear'),
        ('University', 'Univ'),
        ('Address', 'Address'),
    )
    doc_detailed_data = {}
    for key, span_id in span_ids:
        span = soup.find("span", {"id": span_id})
        # Skip only the field that is missing; a single absent span must not
        # discard every field that follows it.
        if span is not None:
            doc_detailed_data[key] = span.text

    fileName = "doc_details_{}".format(limit)
    with open('/home/ubuntu/doctors_data/raw_files/{}.txt'.format(fileName), 'a') as f:
        # NOTE(review): objects are appended back to back with no separator;
        # kept as-is because the downstream reader is not visible here.
        f.write(json.dumps(doc_detailed_data))

if __name__ == "__main__":
    # Launch one worker thread per 1000-record chunk, for chunks 1..1000.
    for i in range(1, 1001):
        print("Iteration: {}x1000entries (1000 iterations)".format(i))
        # Pause for 10 seconds before every 50th launch to throttle
        # thread start-up.
        if i % 50 == 0:
            time.sleep(10)
        doctorsListFetch = doctorsListThread(i)
        doctorsListFetch.start()
	import requests
	from bs4 import BeautifulSoup
	import sys
	import json
	import threading
	import re
	import time
	import os

	# Proxy endpoints handed to requests.get(); both values are placeholders
	# that must be replaced with real proxy URLs before running.
	proxies = {
	    'http': '<give http proxy here>',
	    'https': '<give https proxy here>',
	}

	class writeDocDetailThread(threading.Thread):
	    """Thread wrapper that scrapes and logs a single doctor record.

	    index -- record ID handed to ``log_doc_details``.
	    limit -- second argument handed to ``log_doc_details`` (the upper
	             bound of the chunk the record belongs to).
	    """

	    def __init__(self, index, limit):
	        super(writeDocDetailThread, self).__init__()
	        self.index = index
	        self.limit = limit

	    def run(self):
	        """Log the record; ordinary errors are printed instead of raised."""
	        try:
	            log_doc_details(self.index, self.limit)
	        except Exception as exc:
	            # Catch ``Exception`` rather than using a bare ``except`` so
	            # SystemExit / KeyboardInterrupt are not swallowed together
	            # with genuine scraping failures.
	            print("Unexpected error : {:s} at {:s}.".format(
	                type(exc).__name__, str(sys.exc_info()[2].tb_lineno)))


	class doctorsListThread(threading.Thread):
	    """Thread that processes one chunk of records via ``get_doctors_detail``.

	    startingIndex -- chunk number forwarded unchanged to
	                     ``get_doctors_detail``.
	    """

	    def __init__(self, startingIndex):
	        super(doctorsListThread, self).__init__()
	        self.startingIndex = startingIndex

	    def run(self):
	        """Process the chunk; ordinary errors are printed instead of raised."""
	        try:
	            get_doctors_detail(self.startingIndex)
	        except Exception as exc:
	            # Catch ``Exception`` rather than using a bare ``except`` so
	            # SystemExit / KeyboardInterrupt are not swallowed together
	            # with genuine scraping failures.
	            print("Unexpected error : {:s} at {:s}.".format(
	                type(exc).__name__, str(sys.exc_info()[2].tb_lineno)))


	def get_doctors_detail(startingIndex):
	    """Scrape one chunk of 1000 consecutive doctor records.

	    startingIndex -- 1-based chunk number; chunk ``k`` covers record IDs
	                     ``(k - 1) * 1000 + 1`` through ``k * 1000`` inclusive.

	    Records are processed one after another in the calling thread. Errors
	    are printed rather than raised.
	    """
	    upperLimit = startingIndex * 1000
	    # Begin one past the previous chunk's upper bound. Starting at
	    # (startingIndex - 1) * 1000 would re-fetch the previous chunk's last
	    # ID, so every boundary record (1000, 2000, ...) would be scraped
	    # twice and written into two different chunk files.
	    startingLimit = upperLimit - 1000 + 1
	    try:
	        for i in range(startingLimit, upperLimit + 1):
	            writeDocDetail = writeDocDetailThread(i, upperLimit)
	            # run() is called directly (not start()), so the record is
	            # processed synchronously and no extra thread is spawned.
	            writeDocDetail.run()
	            print("Logging: {} , Upper: {}".format(i, upperLimit))
	    except Exception as exc:
	        print("Unexpected error : {:s} at {:s}.".format(
	            type(exc).__name__, str(sys.exc_info()[2].tb_lineno)))

	def log_doc_details(index, limit, timeout=30):
	    """Fetch one doctor's detail page and append its fields to a chunk file.

	    index   -- record ID substituted into the ViewDetails URL.
	    limit   -- number used to build the output file name
	               ``doc_details_<limit>.txt``.
	    timeout -- seconds passed to ``requests.get`` as its timeout
	               (optional, defaults to 30).

	    One JSON object is appended per call, containing every field whose
	    ``<span>`` was found on the page. Request and file errors propagate
	    to the caller.
	    """
	    url_to_scrape = 'https://old.mciindia.org/ViewDetails.aspx?ID={}'.format(index)
	    # Without a timeout a stalled server would block this worker forever.
	    r = requests.get(url_to_scrape, proxies=proxies, timeout=timeout)
	    soup = BeautifulSoup(r.text, "html.parser")

	    # (output key, id attribute of the <span> that holds the value)
	    span_ids = (
	        ('Name', 'Name'),
	        ('FatherName', 'FatherName'),
	        ('YearOfInfo', 'lbl_Info'),
	        ('RegistrationNo', 'Regis_no'),
	        ('DateOfReg', 'Date_Reg'),
	        ('StateMedicalCouncil', 'Lbl_Council'),
	        ('Qualification', 'Qual'),
	        ('QualificationYear', 'QualYear'),
	        ('University', 'Univ'),
	        ('Address', 'Address'),
	    )
	    doc_detailed_data = {}
	    for key, span_id in span_ids:
	        span = soup.find("span", {"id": span_id})
	        # Skip only the field that is missing; a single absent span must
	        # not discard every field that follows it.
	        if span is not None:
	            doc_detailed_data[key] = span.text

	    fileName = "doc_details_{}".format(limit)
	    with open('/home/ubuntu/doctors_data/raw_files/{}.txt'.format(fileName), 'a') as f:
	        # NOTE(review): objects are appended back to back with no
	        # separator; kept as-is because the downstream reader is not
	        # visible here.
	        f.write(json.dumps(doc_detailed_data))

	if __name__ == "__main__":
	    # Launch one worker thread per 1000-record chunk, for chunks 1..1000.
	    for i in range(1, 1001):
	        print("Iteration: {}x1000entries (1000 iterations)".format(i))
	        # Pause for 10 seconds before every 50th launch to throttle
	        # thread start-up.
	        if i % 50 == 0:
	            time.sleep(10)
	        doctorsListFetch = doctorsListThread(i)
	        doctorsListFetch.start()