@apanimesh061
Created July 10, 2017 01:49
Generate Random Addresses
# CrawlStreetView.py
# coding=utf-8
"""
This script crawls the street-view web pages and collects all of the
states, the counties in each state, the cities in every county and the
streets in every city.
I created a separate script that generates an apartment number in one
of three formats. See GetRandomAddress.py
"""
from bs4 import BeautifulSoup
import urllib2
import urlparse
from urlparse import urljoin
import time
from datetime import datetime
from unidecode import unidecode
import json
class Stack:
    """
    Data structure used to perform a DFS crawl on a website.
    """

    def __init__(self):
        self.items = []

    def is_empty(self):
        return self.items == []

    def push(self, item):
        self.items.append(item)

    def pop(self):
        return self.items.pop()

    def peek(self):
        return self.items[len(self.items) - 1]

    def size(self):
        return len(self.items)

    def __str__(self):
        return str(self.items)
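# Quick illustration (mine, not part of the gist) of the LIFO order the DFS
# crawl relies on; kept as comments so the script's behaviour is unchanged:
#   s = Stack()
#   s.push("page_1.html")
#   s.push("page_2.html")
#   s.pop()    # -> "page_2.html" (most recently discovered link is expanded first)
#   s.size()   # -> 1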
def remove_non_ascii(text):
    """
    Transliterates non-ASCII characters to their closest ASCII equivalents.
    :param text: input text
    :return: ASCII-only text
    """
    return unidecode(unicode(text, encoding="utf-8"))


def normalize(text):
    """
    Normalizes non-ASCII characters in the input text.
    :param text: input text
    :return: normalized text
    """
    try:
        text = remove_non_ascii(text.encode("utf-8"))
    except UnicodeDecodeError:
        text = remove_non_ascii(text)
    return text
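# Illustrative behaviour (assumes unidecode's usual transliteration):
#   normalize(u"Café Street")    # -> "Cafe Street"
#   normalize("Plain ASCII St")  # -> "Plain ASCII St"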
def get_neighbor_links(current_url):
    """
    Get out-links for the current_url.
    :param current_url: input url
    :return: yields {"URL": ...} for pages still to crawl and
             {"LOCATION": ...} for leaf street-view pages
    """
    response = None
    try:
        time.sleep(1)
        response = urllib2.urlopen(current_url)
    except Exception as exp:
        print "Failed URL: {0}".format(current_url)
        print "Message:", exp.message
        print "Taking some rest..."
        time.sleep(8.5)
        attempt = 1
        while True:
            print "Retrying URL: {0}".format(current_url)
            try:
                response = urllib2.urlopen(current_url)
                break
            except:
                if attempt == 5:
                    # Give up on this URL so the retry loop terminates.
                    print "Rejecting URL: {0}".format(current_url)
                    return
                else:
                    attempt += 1
                    time.sleep(1)
                    continue
    web_page = response.read()
    soup = BeautifulSoup(web_page, "lxml")
    for a in soup.find_all('a', href=True):
        abs_url = urljoin(current_url, a["href"])
        # A view.php link opens the StreetView page (the embedded OpenStreetMap
        # view); such a link is a leaf node of the DFS traversal.
        if "view.php" in abs_url:
            abs_url = abs_url.replace('&', 'and')
            parsed = urlparse.urlparse(abs_url)
            place = urlparse.parse_qs(parsed.query)['place']
            yield {"LOCATION": place}
        goto_url_without_base = current_url.rsplit('/', 1)[0] + '/'
        if goto_url_without_base in abs_url:
            yield {"URL": abs_url}
def process_address(address):
    """
    Convert the address string from the street-view page to a dictionary.
    :param address: address string
    :return: address dict
    """
    parts = map(lambda x: x.strip(), address.split(','))
    # Parsing rules
    street = parts[0]
    country = parts[-1]
    zipcode = parts[-2]
    state = parts[-3]
    if state == "D.C.":
        state = parts[-4] + " " + state
    county = ""
    city = ""
    if len(parts) > 5:
        county = parts[-4]
        city = parts[-5]
    yield {
        "STREET": street,
        "CITY": city,
        "COUNTY": county,
        "STATE": state,
        "ZIP": zipcode,
        "COUNTRY": country
    }
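# A minimal sketch (hypothetical place string) of how process_address splits its input:
#   next(process_address("Main Street, Annapolis, Anne Arundel, Maryland, 21401, United States"))
#   -> {"STREET": "Main Street", "CITY": "Annapolis", "COUNTY": "Anne Arundel",
#       "STATE": "Maryland", "ZIP": "21401", "COUNTRY": "United States"}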
if __name__ == '__main__':
    out_file = None
    try:
        out_file = open("address.dat", "w")
        startTime = datetime.now()
        seed = "http://www.geographic.org/streetview/usa/index.html"
        stack = Stack()
        stack.push(seed)
        state_being_covered = None
        valid_leaves = 0
        visited = set()
        while not stack.is_empty():
            current_url = stack.pop()
            if current_url not in visited:
                visited.add(current_url)
                for data in get_neighbor_links(current_url=current_url):
                    if data:
                        url = data.get("URL", None)
                        if url:
                            stack.push(url)
                        else:
                            final_address = data.get("LOCATION", None)
                            if final_address:
                                address_json = next(process_address(final_address[0]))
                                valid_leaves += 1
                                if address_json["STATE"] != state_being_covered:
                                    if state_being_covered:
                                        print "Crawled", valid_leaves, "links..."
                                        print "\n"
                                    state_being_covered = address_json["STATE"]
                                    print "Started with", state_being_covered
                                    valid_leaves = 0
                                json.dump(address_json, out_file)
                                out_file.write("\n")
                                break
        totalTime = datetime.now() - startTime
        print "Total Time Taken:", totalTime
    except KeyboardInterrupt:
        print "Crawl interrupted!"
        print "Info stored in address.dat"
    finally:
        out_file.close()
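# Illustrative only: each line written to address.dat is one JSON object such as
#   {"STREET": "N 1 1/4 Road", "CITY": "Harrietta", "COUNTY": "Wexford",
#    "STATE": "Michigan", "ZIP": "49638", "COUNTRY": "United States"}
# (field values borrowed from the sample record shown in GetRandomAddress.py).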
# GetRandomAddress.py
import random
import json
def random_address(obj):
    """
    Returns a random address from the file that stores all of the street
    addresses collected by crawling http://www.geographic.org/streetview/usa
    :param obj: file object that stores addresses
    :return: a random line from the file as a dictionary, e.g.
    {u'CITY': u'Harrietta', 'APARTMENT': '661-9178', u'ZIP': u'49638', u'COUNTRY': u'United States',
     u'COUNTY': u'Wexford', u'STATE': u'Michigan', u'STREET': u'N 1 1/4 Road'}
    """
    def random_with_n_digits(n):
        range_start = 10 ** (n - 1)
        range_end = (10 ** n) - 1
        return random.randint(range_start, range_end)

    def generate_apartment_number():
        """
        Generates an apartment number in one of three formats:
        1. three digits, a hyphen, then four digits (e.g. 661-9178)
        2. two digits followed by an upper-case letter (e.g. 42K)
        3. '#' followed by one to three digits (e.g. #7, #83, #514)
        :return: a string representing an apt. number
        """
        apt_type_flag = random.randint(1, 3)
        if apt_type_flag == 1:
            return str(random_with_n_digits(3)) + '-' + str(random_with_n_digits(4))
        elif apt_type_flag == 2:
            return str(random_with_n_digits(2)) + chr(random.randint(65, 90))
        elif apt_type_flag == 3:
            return "#" + str(random_with_n_digits(random.randint(1, 3)))

    # Reservoir sampling: pick one line uniformly at random in a single pass
    # without loading the whole file into memory.
    line = next(obj)
    for num, aline in enumerate(obj):
        if random.randrange(num + 2):
            continue
        line = aline
    address_json = json.loads(line)
    address_json["APARTMENT"] = generate_apartment_number()
    return address_json
if __name__ == '__main__':
    # Download addresses.dat from https://www.dropbox.com/s/gl9xcq7f0lsv72d/addresses.dat?dl=0
    # or create one yourself using CrawlStreetView.py (which writes address.dat;
    # rename the file accordingly).
    file_obj = open("addresses.dat", "rb")
    print random_address(obj=file_obj)
    file_obj.close()
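# A minimal usage sketch of my own (assumes addresses.dat exists): random_address()
# reads the file object to the end, so seek back to the start (or reopen the file)
# before drawing another address.
#   with open("addresses.dat", "rb") as f:
#       for _ in range(3):
#           f.seek(0)
#           print random_address(obj=f)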