@tswicegood
Last active November 17, 2021 23:22
Simple scraper for looking through a bunch of saved Craigslist listings for a phone number
# Conda environment file -- these packages are required for this script to work
name: craigslist
dependencies:
  - lxml
  - pip
  - python=3.4*
  - requests
  - pip:
    - aiohttp
    - pyquery
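# A minimal setup sketch, assuming the environment file above is saved as
# environment.yml (that filename is an assumption; the gist does not name it):
#
#   conda env create -f environment.yml
#   source activate craigslist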
"""
This script scans a collection of saved Craigslist bike listings, looking for
postings that match a given phone number. Note: use this at your own risk,
et cetera, and so on.

To use it, create a directory containing pages saved from Craigslist's main
listing (or search results) for the bike category. The default location is
~/Desktop/bikes/.

Once you've gathered the saved pages, you can run this script with the
following command:

    PHONE_NUMBER=5553334444 python scraper.py

Replace the value of PHONE_NUMBER with whatever number you are searching for.
I had a situation where I knew a stolen bike had been listed with a given
phone number, so that's what this searches for. The script could be modified
to look at other values as well.
"""
import aiohttp
import asyncio
import os
import sys
from pyquery import PyQuery as pq
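# Throttle the scraper: at most two requests to Craigslist in flight at a time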
sem = asyncio.Semaphore(2)
"""
## Configuration

These values can be configured via environment variables. PHONE_NUMBER is
required; the other supported values are:

* CURRENT_CITY -- the name of the Craigslist city you want to search
* HTML_DIR -- the location of the saved HTML files
"""
SEARCH_NUMBER = os.getenv("PHONE_NUMBER", False)
if SEARCH_NUMBER is False:
sys.stderr.write("Unable to search without a phone number.\n\n")
sys.stderr.write("Please re-run like this:\n")
sys.stderr.write(" PHONE_NUMBER=5553334444 python scraper.py\n\n")
sys.exit(-1)
CURRENT_CITY = os.getenv("CURRENT_CITY", "losangeles")
BASE_CRAIGSLIST = "http://{city}.craigslist.org".format(city=CURRENT_CITY)
BIKE_TEMPLATE = "{url}/search/sgv/bik".format(url=BASE_CRAIGSLIST)
DIRECTORY = os.getenv("HTML_DIR", os.path.expanduser("~/Desktop/bikes"))
@asyncio.coroutine
def get(url):
    response = yield from aiohttp.request('GET', url)
    return (yield from response.read_and_close())
def out(s):
    sys.stdout.write(s)
    sys.stdout.flush()
def find_bikes():
    files = os.listdir(DIRECTORY)
    print("Checking %d files" % len(files))
    for file in files:
        with open(os.path.join(DIRECTORY, file)) as f:
            doc = pq(f.read())
        possible_links = doc.find(".content .row a.hdrlnk")
        for link in possible_links:
            del link.attrib["class"]
            link.attrib["name"] = link.text.strip()
            yield link.attrib
@asyncio.coroutine
def lookup_reply(bike):
    url = "{base}/reply/lax/bik/{id}".format(base=BASE_CRAIGSLIST,
                                             id=bike["data-id"])
    with (yield from sem):
        page = yield from get(url)
    doc = pq(page)
    try:
        phone = doc.find(".reply_options > ul")[2].find("li")
    except IndexError:
        out("-")
        return False
    if phone.text is None:
        out("-")
        return False
    phone = (phone.text[1:]        # Strip off the leading telephone symbol
             .strip()              # Clear whitespace
             .replace("-", ""))    # Make it a solid number
    out(".")
    if phone == SEARCH_NUMBER:
        out("!X!")
        return bike
    return False
@asyncio.coroutine
def process_bikes(bikes):
    matches = []
    coroutines = [lookup_reply(bike) for bike in bikes]
    for coroutine in asyncio.as_completed(coroutines):
        result = yield from coroutine
        if result is False:
            continue
        matches.append(result)
    print()
    print("Found %d matches" % len(matches))
    print("-" * 80)
    for match in matches:
        print(match)
if __name__ == "__main__":
    bikes = find_bikes()
    try:
        loop = asyncio.get_event_loop()
        loop.run_until_complete(process_bikes(bikes))
    except KeyboardInterrupt:
        print("Ctrl+C caught, stopping")
        sys.exit(0)