Skip to content

Instantly share code, notes, and snippets.

@charanjit-singh
Created March 10, 2021 13:54
Show Gist options
  • Save charanjit-singh/44b792fd92c4ab3f709422589a7f08a9 to your computer and use it in GitHub Desktop.
Save charanjit-singh/44b792fd92c4ab3f709422589a7f08a9 to your computer and use it in GitHub Desktop.
Script to scrape RERA information from the Punjab RERA website (using multithreading)
import requests
from bs4 import BeautifulSoup
import codecs
import threading
# Parse the locally saved listing page into one dict per broker row.
# The file is read as raw bytes so that BeautifulSoup works out the character
# encoding itself instead of relying on the platform's default codec, and the
# `with` block closes the handle as soon as the content is in memory.
with open("RERA_PUNJAB_LIST.html", "rb") as f:
    page = f.read()
soup = BeautifulSoup(page, 'html.parser')

# One entry per table row that carries broker data; consumed by the CSV dump
# and by the worker threads further down the script.
BROKERS = []
for tr in soup.find_all("tr"):
    tds = tr.find_all("td")
    anchors = tr.find_all("a")
    inputs = tr.find_all("input")
    # A row lacking the five data cells, the anchor or the input (for example
    # a header row built from <th> cells) cannot describe a broker; skip it
    # instead of failing with IndexError.
    if len(tds) < 5 or not anchors or not inputs:
        continue
    # Every text column is stripped, including the validity date.
    sr, name, district, rera, registration_valid_upto = (
        td.get_text().strip() for td in tds[:5]
    )
    # The id of the first anchor tells which detail view the row opens.
    type_ = anchors[0].get('id')
    agent_id = inputs[0].get("value")
    BROKERS.append({
        "Sr. No.": sr,
        "Name": name,
        "District": district,
        "RERA No.": rera,
        "Registration Valid Upto": registration_valid_upto,
        "Offline": 1 if type_ == 'modalOpenerOfflineRegisteredButton' else 0,
        "Agent ID": agent_id,
    })
import csv
# Write the base listing (one row per parsed broker) to a CSV file.
csv_file = "PUNJAB_RERA_LIST.csv"
# With nothing parsed there is no first row to take the header from; fall back
# to an empty header rather than raising IndexError.
fieldnames = list(BROKERS[0].keys()) if BROKERS else []
try:
    # newline='' lets the csv module control line endings itself (otherwise
    # blank rows appear on Windows); an explicit encoding keeps the output
    # independent of the platform default.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(BROKERS)
except IOError as e:
    # Keep going (the detail scrape does not depend on this file) but say why
    # the write failed.
    print("I/O error", e)
import re
import time
class OnlineThread(threading.Thread):
    """Thread that scrapes the detail page of every broker flagged ``Offline == 0``.

    For each such entry of the module-level ``BROKERS`` list the detail page
    is downloaded, every ``single-detail`` cell is paired with the cell that
    follows it, and the resulting key/value pairs are added to the broker
    dict. All successfully processed brokers are written to
    ``ONLINE_PUNJAB_RERA_LIST.csv``; brokers whose page could not be fetched
    are printed at the end.
    """

    # Seconds to wait on the server before a request is treated as failed.
    # Without a timeout a single stalled connection would block the thread
    # indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, name,):
        """Create the thread and label it with *name*."""
        threading.Thread.__init__(self)
        self.name = name

    @staticmethod
    def _clean_text(text):
        """Return *text* stripped, with line breaks and runs of spaces reduced to one space."""
        return re.sub(' +', ' ', text.strip().replace("\r", " ").replace("\n", " "))

    def run(self):
        """Fetch, parse and export the details of all online-registered brokers."""
        # Column order: the base listing columns first, then detail labels in
        # the order they are first encountered. An empty BROKERS list yields
        # an empty header instead of an IndexError.
        online_keys = list(BROKERS[0].keys()) if BROKERS else []
        pending = [broker for broker in BROKERS if broker['Offline'] == 0]
        total = len(pending)
        online_brokers = []
        failed_onlines = []

        for position, broker in enumerate(pending, start=1):
            print("Processing Online Broker:", position, '/', total)
            url = "https://rera.punjab.gov.in/reraindex/PublicView/AgentViewDetails?inAgent_ID=" + broker['Agent ID']
            try:
                broker_page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                # Treat HTTP error responses as failures rather than parsing
                # an error page as if it were a detail page.
                broker_page.raise_for_status()
            except Exception as e:
                # Deliberately broad: one bad broker must not abort the scrape.
                failed_onlines.append(broker)
                print("Failed Online Broker", broker, "Reason:", str(e))
                # Pause before moving on to the next broker.
                time.sleep(5)
                continue

            broker_soup = BeautifulSoup(broker_page.content, "html.parser")
            tds = broker_soup.find_all("td")
            # Walk neighbouring cells: a "single-detail" cell holds the label
            # and the cell right after it holds the value.
            for current_td, next_td in zip(tds, tds[1:]):
                if "single-detail" not in (current_td.get("class") or []):
                    continue
                key = current_td.get_text().strip()
                if key not in online_keys:
                    online_keys.append(key)
                # The dict from BROKERS is updated in place.
                broker[key] = self._clean_text(next_td.get_text())
            online_brokers.append(broker)

        csv_file = "ONLINE_PUNJAB_RERA_LIST.csv"
        try:
            # newline='' is required by the csv module to avoid blank rows on
            # Windows; the explicit encoding avoids platform-dependent output.
            with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=online_keys)
                writer.writeheader()
                writer.writerows(online_brokers)
        except IOError as e:
            print("I/O error", e)
        print("FAILED ONLINE BROKERS", failed_onlines)
class OfflineThread(threading.Thread):
    """Thread that scrapes the detail page of every broker flagged ``Offline == 1``.

    For each such entry of the module-level ``BROKERS`` list the
    offline-registered detail page is downloaded, every ``single-detail``
    cell is paired with the cell that follows it, and the resulting
    key/value pairs are added to the broker dict. All successfully processed
    brokers are written to ``OFFLINE_PUNJAB_RERA_LIST.csv``; brokers whose
    page could not be fetched are printed at the end.
    """

    # Seconds to wait on the server before a request is treated as failed.
    # Without a timeout a single stalled connection would block the thread
    # indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, name,):
        """Create the thread and label it with *name*."""
        threading.Thread.__init__(self)
        self.name = name

    @staticmethod
    def _clean_text(text):
        """Return *text* stripped, with line breaks and runs of spaces reduced to one space."""
        return re.sub(' +', ' ', text.strip().replace("\r", " ").replace("\n", " "))

    def run(self):
        """Fetch, parse and export the details of all offline-registered brokers."""
        # Column order: the base listing columns first, then detail labels in
        # the order they are first encountered. An empty BROKERS list yields
        # an empty header instead of an IndexError.
        offline_keys = list(BROKERS[0].keys()) if BROKERS else []
        pending = [broker for broker in BROKERS if broker['Offline'] == 1]
        total = len(pending)
        offline_brokers = []
        failed_offlines = []

        for position, broker in enumerate(pending, start=1):
            url = "https://rera.punjab.gov.in/reraindex/PublicView/AgentViewOfflineRegisteredDetails?inAgent_ID=" + broker['Agent ID']
            print("Processing Offline Broker:", position, '/', total, "URL:", url)
            try:
                broker_page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                # Treat HTTP error responses as failures rather than parsing
                # an error page as if it were a detail page.
                broker_page.raise_for_status()
            except Exception as e:
                # Deliberately broad: one bad broker must not abort the scrape.
                print("Failed offline Broker", broker, "Reason:", str(e))
                failed_offlines.append(broker)
                continue

            broker_soup = BeautifulSoup(broker_page.content, "html.parser")
            tds = broker_soup.find_all("td")
            # Walk neighbouring cells: a "single-detail" cell holds the label
            # and the cell right after it holds the value.
            for current_td, next_td in zip(tds, tds[1:]):
                if "single-detail" not in (current_td.get("class") or []):
                    continue
                key = current_td.get_text().strip()
                if key not in offline_keys:
                    offline_keys.append(key)
                # The dict from BROKERS is updated in place.
                broker[key] = self._clean_text(next_td.get_text())
            offline_brokers.append(broker)

        csv_file = "OFFLINE_PUNJAB_RERA_LIST.csv"
        try:
            # newline='' is required by the csv module to avoid blank rows on
            # Windows; the explicit encoding avoids platform-dependent output.
            with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=offline_keys)
                writer.writeheader()
                writer.writerows(offline_brokers)
        except IOError as e:
            print("I/O error", e)
        print("FAILED OFFLINE BROKERS", failed_offlines)
# Build one worker per registration type, then launch them one after the
# other so that both scrapes run side by side.
onlineThread = OnlineThread("Online")
offlineThread = OfflineThread("Offline")
for worker in (onlineThread, offlineThread):
    worker.start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment