@tswicegood
Last active November 17, 2021 23:22
Simple scraper for looking through a bunch of saved Craigslist listings for a phone number
# Conda environment file -- these packages are required for this script to work
name: craigslist
dependencies:
  - lxml
  - pip
  - python=3.4*
  - requests
  - pip:
    - aiohttp
    - pyquery
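# A minimal setup sketch, assuming the environment file above is saved as
# environment.yml (that filename is an assumption; the gist does not name it):
#
#   conda env create -f environment.yml
#   source activate craigslist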
"""
This script scans a collection of saved Craigslist bike listings, looking for
postings that match a given phone number. Note: use this at your own risk,
et cetera, and so on.

To use it, create a directory containing pages saved from Craigslist's main
listing (or search results) for the bike category. The default location is
~/Desktop/bikes/.

Once you've gathered the saved pages, you can run this script with the
following command:

    PHONE_NUMBER=5553334444 python scraper.py

Replace the value of PHONE_NUMBER with whatever number you are searching for.
I had a situation where I knew a stolen bike had been listed with a given
phone number, so that's what this searches for. The script could be modified
to look at other values as well.
"""
import aiohttp
import asyncio
import os
import sys
from pyquery import PyQuery as pq
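# Throttle the scraper: at most two requests to Craigslist in flight at a time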
sem = asyncio.Semaphore(2)
"""
## Configuration

These values can be configured via environment variables. PHONE_NUMBER is
required; the other supported values are:

* CURRENT_CITY -- the name of the Craigslist city you want to search
* HTML_DIR -- the location of the saved HTML files
"""
SEARCH_NUMBER = os.getenv("PHONE_NUMBER", False)
if SEARCH_NUMBER is False:
sys.stderr.write("Unable to search without a phone number.\n\n")
sys.stderr.write("Please re-run like this:\n")
sys.stderr.write(" PHONE_NUMBER=5553334444 python scraper.py\n\n")
sys.exit(-1)
CURRENT_CITY = os.getenv("CURRENT_CITY", "losangeles")
BASE_CRAIGSLIST = "http://{city}.craigslist.org".format(city=CURRENT_CITY)
BIKE_TEMPLATE = "{url}/search/sgv/bik".format(url=BASE_CRAIGSLIST)
DIRECTORY = os.getenv("HTML_DIR", os.path.expanduser("~/Desktop/bikes"))
@asyncio.coroutine
def get(url):
    response = yield from aiohttp.request('GET', url)
    return (yield from response.read_and_close())
def out(s):
    sys.stdout.write(s)
    sys.stdout.flush()
def find_bikes():
    files = os.listdir(DIRECTORY)
    print("Checking %d files" % len(files))
    for file in files:
        with open(os.path.join(DIRECTORY, file)) as f:
            doc = pq(f.read())
        possible_links = doc.find(".content .row a.hdrlnk")
        for link in possible_links:
            del link.attrib["class"]
            link.attrib["name"] = link.text.strip()
            yield link.attrib
@asyncio.coroutine
def lookup_reply(bike):
    url = "{base}/reply/lax/bik/{id}".format(base=BASE_CRAIGSLIST,
                                             id=bike["data-id"])
    with (yield from sem):
        page = yield from get(url)
    doc = pq(page)
    try:
        phone = doc.find(".reply_options > ul")[2].find("li")
    except IndexError:
        out("-")
        return False
    if phone.text is None:
        out("-")
        return False
    phone = (phone.text[1:]        # Strip off the leading telephone symbol
             .strip()              # Clear whitespace
             .replace("-", ""))    # Make it a solid number
    out(".")
    if phone == SEARCH_NUMBER:
        out("!X!")
        return bike
    return False
@asyncio.coroutine
def process_bikes(bikes):
    matches = []
    coroutines = [lookup_reply(bike) for bike in bikes]
    for coroutine in asyncio.as_completed(coroutines):
        result = yield from coroutine
        if result is False:
            continue
        matches.append(result)
    print()
    print("Found %d matches" % len(matches))
    print("-" * 80)
    for match in matches:
        print(match)
if __name__ == "__main__":
    bikes = find_bikes()
    try:
        loop = asyncio.get_event_loop()
        loop.run_until_complete(process_bikes(bikes))
    except KeyboardInterrupt:
        print("Ctrl+C caught, stopping")
        sys.exit(0)