@ideamonk
Created January 16, 2011 05:26
Finds the closest craigslist city for a given Patch location.
import urllib2
from BeautifulSoup import BeautifulSoup
from multiprocessing import Pool


def getMaxUrl(logline):
    # Each input line is pipe-delimited: the first field is the Patch
    # place name, the last two are the state name and its abbreviation.
    donkey_kong = logline.split('|')
    orig_place = donkey_kong[0]
    place = orig_place.split('-')[0]
    state, short_form = donkey_kong[-2:]

    # Query every craigslist city in the state for listings tagged with
    # this place; the city returning the most hits is the closest match.
    max_count = 0
    max_url = ""
    for state_link in state_dict[state.strip()].findAll('li'):
        state_url = state_link.find('a')['href']
        query = "/search/?query=Location:\"%s\"&catAbb=sss" % place.replace(' ', '+')
        state_soup = BeautifulSoup(urllib2.urlopen(state_url + query).read())
        # craigslist shows the hit count in an <h4 class="ban"> banner,
        # e.g. "Found: 120 Displaying: 1 - 100".
        hints = str(state_soup.find('h4', {'class': 'ban'}))
        if hints.find("Found:") != -1:
            count = int(hints.split('Found:')[1].split("Displaying")[0].strip())
            if count > max_count:
                max_count = count
                max_url = state_url
    print place, max_url
    return (place, orig_place, max_url)


if __name__ == '__main__':
    # Scrape the craigslist sites index and map each state name to the
    # <ul> of city links that follows its state_delimiter heading.
    soup = BeautifulSoup(urllib2.urlopen("http://www.craigslist.org/about/sites").read())
    state_dict = {}
    for state in soup.findAll("div", {'class': 'colmask'})[0].findAll("div", {'class': 'state_delimiter'}):
        state_dict[state.contents[0]] = state.findNext('ul')

    f = open("patchsites.txt")
    o = open("patch_links.txt", "w+")
    o.write("{")
    lines = f.readlines()

    # Workers inherit state_dict via fork, so this requires a Unix-like
    # platform; 20 processes keep the HTTP requests overlapping.
    pool = Pool(processes=20)
    results = [pool.apply_async(getMaxUrl, (a,)) for a in lines]
    results = [r.get() for r in results]

    # Write a dict literal mapping place -> [original name, winning URL].
    # The trailing comma is valid Python syntax but not strict JSON.
    for city, o_city, url in results:
        o.write("\"%s\" : [\"%s\",\"%s\"]," % (city, o_city, url))
    o.write("}")
    o.close()