Last active
June 16, 2023 14:30
-
-
Save loisaidasam/8734712 to your computer and use it in GitHub Desktop.
Hack of the day: How to scour craigslist for a 62cm-64cm bicycle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from bs4 import BeautifulSoup | |
import requests | |
BASE_URL = "http://newyork.craigslist.org"
URL = "http://newyork.craigslist.org/search/?sort=rel&areaID=3&subAreaID=&query=bicycle&catAbb=sss"
SLEEP_BETWEEN_REQS_SECS = 5

# Lowercased substrings to hunt for on each listing page: every frame
# size both with and without a space before "cm", kept in the same
# interleaved order the original append loop produced
# ('62 cm', '62cm', '63 cm', ...).
SEARCH_ITEMS = [fmt % size
                for size in (62, 63, 64)
                for fmt in ('%s cm', '%scm')]
def scour_link(link):
    """Fetch a single listing page and report which SEARCH_ITEMS it mentions.

    Prints the page title, a "FOUND <item>!" line for every matching
    search item, then sleeps SLEEP_BETWEEN_REQS_SECS before returning.
    Raises requests.HTTPError (via raise_for_status) on a bad response.
    """
    print("scour_link(%s)" % link)
    response = requests.get(link)
    response.raise_for_status()
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # parsing deterministic across environments.
    soup = BeautifulSoup(response.content, 'html.parser')
    print(soup.title.string)
    # Lowercase once, outside the loop; use .text (decoded) so the
    # substring test is str-vs-str on both Python 2 and 3.
    page_text = response.text.lower()
    for search_item in SEARCH_ITEMS:
        if search_item in page_text:
            # BUG FIX: original printed the literal "\tFOUND %s!" -- the
            # "% search_item" format argument was missing.
            print("\tFOUND %s!" % search_item)
    print("")
    # Be polite to craigslist between requests.
    time.sleep(SLEEP_BETWEEN_REQS_SECS)
def scour():
    """Fetch the search-results page and scour every listing link on it.

    Collects the unique '.html' listing URLs from the results page's
    content div, prefixes relative URLs with BASE_URL, and hands each
    one to scour_link(). Raises requests.HTTPError on a bad response.
    """
    response = requests.get(URL)
    response.raise_for_status()
    # Explicit parser keeps bs4 behavior deterministic across environments.
    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find('div', attrs={'class': 'content'})
    links = set()
    for anchor in content.find_all('a'):
        url = anchor.get('href')
        # BUG FIX: <a> tags without an href make .get() return None,
        # which would crash the "'.html' in url" substring test.
        if url and '.html' in url:
            links.add(url)
    print("Found %s links to scour" % len(links))
    for link in links:
        # Relative listing URLs need the site prefix.
        if not link.startswith('http'):
            link = "%s%s" % (BASE_URL, link)
        scour_link(link)
def main():
    """Script entry point: run one scouring pass over the search results."""
    scour()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment