Last active
June 16, 2023 14:30
-
-
Save loisaidasam/8734712 to your computer and use it in GitHub Desktop.
Hack of the day: How to scour craigslist for a 62cm-64cm bicycle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from bs4 import BeautifulSoup | |
import requests | |
BASE_URL = "http://newyork.craigslist.org"
URL = "http://newyork.craigslist.org/search/?sort=rel&areaID=3&subAreaID=&query=bicycle&catAbb=sss"
SLEEP_BETWEEN_REQS_SECS = 5

# Lowercased substrings to hunt for on each listing page: every frame
# size both with and without a space before "cm", kept in the same
# interleaved order the original append loop produced
# ('62 cm', '62cm', '63 cm', ...).
SEARCH_ITEMS = [fmt % size
                for size in (62, 63, 64)
                for fmt in ('%s cm', '%scm')]
def scour_link(link):
    """Fetch a single listing page and report which SEARCH_ITEMS it mentions.

    Prints the page title, a "FOUND <item>!" line for every matching
    search item, then sleeps SLEEP_BETWEEN_REQS_SECS before returning.
    Raises requests.HTTPError (via raise_for_status) on a bad response.
    """
    print("scour_link(%s)" % link)
    response = requests.get(link)
    response.raise_for_status()
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # parsing deterministic across environments.
    soup = BeautifulSoup(response.content, 'html.parser')
    print(soup.title.string)
    # Lowercase once, outside the loop; use .text (decoded) so the
    # substring test is str-vs-str on both Python 2 and 3.
    page_text = response.text.lower()
    for search_item in SEARCH_ITEMS:
        if search_item in page_text:
            # BUG FIX: original printed the literal "\tFOUND %s!" -- the
            # "% search_item" format argument was missing.
            print("\tFOUND %s!" % search_item)
    print("")
    # Be polite to craigslist between requests.
    time.sleep(SLEEP_BETWEEN_REQS_SECS)
def scour():
    """Fetch the search-results page and scour every listing link on it.

    Collects the unique '.html' listing URLs from the results page's
    content div, prefixes relative URLs with BASE_URL, and hands each
    one to scour_link(). Raises requests.HTTPError on a bad response.
    """
    response = requests.get(URL)
    response.raise_for_status()
    # Explicit parser keeps bs4 behavior deterministic across environments.
    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find('div', attrs={'class': 'content'})
    links = set()
    for anchor in content.find_all('a'):
        url = anchor.get('href')
        # BUG FIX: <a> tags without an href make .get() return None,
        # which would crash the "'.html' in url" substring test.
        if url and '.html' in url:
            links.add(url)
    print("Found %s links to scour" % len(links))
    for link in links:
        # Relative listing URLs need the site prefix.
        if not link.startswith('http'):
            link = "%s%s" % (BASE_URL, link)
        scour_link(link)
def main():
    """Script entry point: run one scouring pass over the search results."""
    scour()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment