@johnfink8
Created March 27, 2012 01:01
Simple Django one-off view that pulls down a listing of Craigslist ads from all Craigslist domains. Ain't exactly pretty, but it works. It's not fast by web standards, but it sure beats searching manually.
import urllib2,os,cookielib
from urlparse import urlparse
from BeautifulSoup import BeautifulSoup
from django.core.cache import cache
from django.http import HttpResponse
import multiprocessing
m=multiprocessing.Manager()
links=m.list()
hosts=m.Queue()
COOKIEFILEBASE = '/tmp/cookie.tmp.'
#Fetch a url. Simple enough, maybe. But we also check against a cache
#backend, if available. And we honor cookies. Cache the results for 12
#hours, so we don't hammer anyone's servers unnecessarily, because we're
#not looking for any time-sensitive data.
def fetch_url(request,url):
    server_name=urlparse(url).netloc
    COOKIEFILE=COOKIEFILEBASE + server_name
    headers={
        'Host':server_name,
        'User-Agent':request.META['HTTP_USER_AGENT']
    }
    cache_key = url
    if cache.get(cache_key):
        return cache.get(cache_key)
    cj = cookielib.LWPCookieJar()
    urlopen = urllib2.urlopen
    Request = urllib2.Request
    if os.path.isfile(COOKIEFILE):
        cj.load(COOKIEFILE)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    req=Request(url,None,headers)
    response=urlopen(req).read()
    #Save any cookies the server handed back so later requests can reuse them.
    cj.save(COOKIEFILE)
    cache.set(cache_key,response,60*60*12)
    return response
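#Note (not part of the original gist): fetch_url assumes Django's cache
#framework is configured.  Below is a minimal sketch of the settings.py entry
#it relies on -- the local-memory backend is only an assumption; any backend
#(memcached, database, ...) looks the same through cache.get()/cache.set():
#
#    CACHES = {
#        'default': {
#            'BACKEND': 'django.core.cache.backends.locmem.LocMemCache',
#        }
#    }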
#Called for each process created in the view function.
#Pulls a queued hostname, runs a standardized Craigslist search on that
#hostname, and adds an HTML anchor to the list of output in links.
#This particular Craigslist search is for 'django' with the
#'telecommuting' flag set.
def worker_thread(request):
    while True:
        try:
            host=hosts.get_nowait()
        except:
            print "thread done"
            return
        url='http://%s/search/jjj?query=django&srchType=A&addOne=telecommuting'%(host,)
        html=fetch_url(request,url)
        try:
            soup=BeautifulSoup(html)
        except:
            #If the page can't be parsed, dump it, mark the job done, and move
            #on so hosts.join() doesn't block forever on this item.
            print html
            hosts.task_done()
            continue
        for p in soup.findAll('blockquote')[-1].findAll('p'):
            links.append('<a href="%s">%s - %s</a>'%(p.a['href'],p.text,host))
        hosts.task_done()
        print host
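#A hedged aside (not in the original gist): the search URL above is hard-coded
#for 'django' with the telecommuting flag set.  To vary the query, the same URL
#can be built with urllib.urlencode.  The parameter names (query, srchType,
#addOne) are the ones worker_thread already uses; the helper itself is only
#illustrative.
import urllib
def build_search_url(host, query, telecommute=True):
    params={'query':query,'srchType':'A'}
    if telecommute:
        params['addOne']='telecommuting'
    return 'http://%s/search/jjj?%s'%(host,urllib.urlencode(params))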
#Actual view that is mapped to a url.
#Builds a list of sites based on the craigslist front page.
#Pushes them all into a queue, and starts off 10 processes working
#that queue
def find_all_postings(request):
    url='http://www.craigslist.org/about/sites'
    soup=BeautifulSoup(fetch_url(request,url))
    for li in soup.find(attrs={'class':'colmask'}).findAll('li'):
        hosts.put(urlparse(li.a['href']).netloc)
    for index in range(0,10):
        #If you'll notice, these two lines (other than some variables) are
        #the only two we need to make this a multi-process app
        thread=multiprocessing.Process(target=worker_thread, args=(request,))
        thread.start()
    hosts.join()
    sorted_links = sorted(links, key=lambda x: BeautifulSoup(x).text)
    return HttpResponse('<br/>'.join(sorted_links))
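To actually hit this view you still need a URLconf entry. A minimal sketch for the Django versions current when this was written; the module path searchapp.views is made up for illustration:

from django.conf.urls.defaults import patterns, url
from searchapp.views import find_all_postings

urlpatterns = patterns('',
    url(r'^craigslist/$', find_all_postings),
)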