Skip to content

Instantly share code, notes, and snippets.

@bjpcjp
Forked from theriley106/housingScrape.py
Created February 8, 2018 22:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bjpcjp/602878059a2db631e5785c7e1cc76a4f to your computer and use it in GitHub Desktop.
Save bjpcjp/602878059a2db631e5785c7e1cc76a4f to your computer and use it in GitHub Desktop.
Scraping Valid Addresses from all US ZipCodes
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import requests
import bs4
import zipcode
import threading
import re
import json
import time
# Number of worker threads the ZIP-code list is divided between.
THREADS = 30
# Sample search URL; not referenced by any function visible in this file.
url = "https://www.hud.com/listing/search/ui/event?q=29680"
# Guards DB, which the workers fill while the writer thread serialises it.
lock = threading.Lock()
# ZIP code (as str) -> list of address strings scraped for that ZIP code.
DB = dict()
# ZIP codes the workers have finished with; read only for progress output.
completedZips = list()
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    sliceStarts = range(0, len(l), n)
    for begin in sliceStarts:
        end = begin + n
        yield l[begin:end]
def genAllZipCodes(lookup=None):
    """Return the ZIP codes found for every leading digit as a list of strings.

    Args:
        lookup: Optional callable that takes a prefix string and returns an
            iterable of records exposing ``to_dict()`` with a ``'zip'`` key.
            Defaults to ``zipcode.islike``.

    Returns:
        list: The ``'zip'`` value of every record, in the order the lookup
        yields them, prefix "0" first and prefix "9" last.
    """
    if lookup is None:
        lookup = zipcode.islike
    listOfAllZipCodes = []
    # Exactly the ten single-digit prefixes. Also querying "10" would return
    # codes that the "1" query has already produced, listing them twice.
    for digit in range(10):
        listOfAllZipCodes.extend(
            match.to_dict()['zip'] for match in lookup(str(digit)))
    return listOfAllZipCodes
def grabSite(url):
    """Fetch ``url`` with a fixed set of browser-like request headers.

    The request goes through ``requests.get`` with a 15 second timeout and
    the resulting response object is returned unchanged; anything the call
    raises propagates to the caller.
    """
    browserHeaders = dict([
        ('Accept-Encoding', 'gzip, deflate, sdch'),
        ('Accept-Language', 'en-US,en;q=0.8'),
        ('Upgrade-Insecure-Requests', '1'),
        ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
        ('Cache-Control', 'max-age=0'),
        ('Connection', 'keep-alive'),
    ])
    response = requests.get(url, headers=browserHeaders, timeout=15)
    return response
def validAddress(address):
    """Return True when ``address`` contains at least one digit.

    Args:
        address: Candidate address; it is converted with ``str()`` before
            matching, so non-string values are accepted.

    Returns:
        bool: True if the string form contains a digit, otherwise False.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence that
    # newer interpreters warn about. One digit is enough, so search() can
    # stop at the first match instead of collecting all of them.
    return re.search(r'\d', str(address)) is not None
def updateJson(fileName='result.json', interval=30):
    """Periodically dump the shared ``DB`` dict to ``fileName`` as JSON.

    Loops forever: sleeps, rewrites the file while holding ``lock``, then
    prints how many ZIP codes are in ``completedZips`` out of ``initZipVal``.
    It never returns, so it is meant to be the target of its own thread.

    Args:
        fileName: Path of the JSON file, overwritten on every cycle.
        interval: Seconds to sleep before each write.
    """
    while True:
        time.sleep(interval)
        # The context manager releases the lock even when opening the file
        # or serialising fails; a bare acquire()/release() pair would leave
        # it held and block every worker waiting to store results.
        with lock:
            with open(fileName, 'w') as fp:
                json.dump(DB, fp)
        print("{} / {} Successfully Completed".format(len(completedZips), initZipVal))
def genThreeHouses(zipCode):
    """Scrape listing addresses for every ZIP code in ``zipCode``.

    For each ZIP code the assist2sell.com listing page is fetched and the
    text of every ``.col-md-12 a`` element is cut at its first comma, split
    on spaces, title-cased and re-joined. Results that pass ``validAddress``
    are stored in ``DB`` under ``str(zip)``; ZIP codes with no such address
    get no entry. Any exception for a ZIP code is printed and processing
    moves on to the next one.

    Args:
        zipCode: Iterable of ZIP codes (despite the singular name); the
            caller passes one chunk of the full list per worker thread.
    """
    # A distinct loop name keeps the parameter from being shadowed.
    for code in zipCode:
        try:
            res = grabSite("https://assist2sell.com/homes/{}".format(code))
            page = bs4.BeautifulSoup(res.text, 'lxml')
            houseList = []
            for val in page.select(".col-md-12 a"):
                words = val.getText().partition(",")[0].split(" ")
                # Consecutive spaces produce empty pieces; drop them.
                address = ' '.join(word.title() for word in words if word)
                if validAddress(address):
                    houseList.append(address)
            if houseList:
                with lock:
                    DB[str(code)] = houseList
            # NOTE(review): the original nesting of this line was lost; it
            # is placed so that every ZIP code processed without an error
            # counts towards progress - confirm that is the intent.
            completedZips.append(code)
        except Exception as exp:
            print("{} - {}".format(code, exp))
zipList = genAllZipCodes()
initZipVal = len(zipList)
# Ceiling division gives at most THREADS chunks, and max(1, ...) keeps the
# chunk size positive when there are fewer ZIP codes than threads (plain
# division yields a zero step there, and a float on Python 3).
zipList = chunks(zipList, max(1, -(-initZipVal // THREADS)))
if __name__ == '__main__':
    # The periodic writer loops forever, so it must be a daemon thread or
    # the process would never exit once the workers are done.
    writer = threading.Thread(target=updateJson)
    writer.daemon = True
    writer.start()
    threads = [threading.Thread(target=genThreeHouses, args=(zipCode,)) for zipCode in zipList]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    # Final dump: results gathered since the writer's last cycle would
    # otherwise be lost when the daemon thread is torn down at exit.
    with lock:
        with open('result.json', 'w') as fp:
            json.dump(DB, fp)
    print("{} / {} Successfully Completed".format(len(completedZips), initZipVal))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment