Last active
August 11, 2018 16:17
-
-
Save gifguide2code/1eda047f657931beea6ef17396c13415 to your computer and use it in GitHub Desktop.
A Python script to scrape the Craigslist apartment listings into an Excel spreadsheet.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urljoin

import bs4
import openpyxl
import requests
# Helper functions: drop blank lines from scraped text and build a dictionary of listing names/URLs
def strip(txt):
    """Return *txt* with blank and whitespace-only lines removed.

    The text is split on "\n"; every line that still has content after
    stripping whitespace is kept unchanged (its own leading/trailing spaces
    are preserved) and terminated with a single "\n".

    Args:
        txt: The text to clean up.

    Returns:
        The retained lines joined together, each followed by "\n".
        An input with no non-blank lines yields an empty string.
    """
    # join() builds the result in one pass instead of repeated `+=` copies.
    return "".join(line + "\n" for line in txt.split("\n") if line.strip() != '')
# Maps each listing description (the link text) to the listing's URL.
listings = {}


def addListing(desc, url):
    """Record a listing in ``listings`` unless its description is already known.

    Args:
        desc: The listing description, used as the dictionary key.
        url: The URL stored for that description.

    The first URL seen for a description wins; later ones with the same
    description are reported with a 'Duplicate' message and discarded.
    """
    if desc in listings:
        print('Duplicate')
    else:
        listings[desc] = url
def getSearchPageListings(url):
    """Download a search results page and register every listing link on it.

    Each ``<a class="result-title hdrlnk">`` element found in the page is
    passed to ``addListing`` as (link text, link URL).

    Args:
        url: Address of the search results page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without a timeout requests waits forever on an unresponsive server.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    for a in soup.find_all("a", class_="result-title hdrlnk"):
        href = a.get('href')
        if not href:
            # An anchor without a target cannot be fetched later on.
            continue
        # get_text() equals a.string for plain links, but still yields the
        # text when the anchor contains nested markup (where a.string is None).
        # urljoin() leaves absolute links untouched and resolves relative ones
        # against the search page address.
        addListing(a.get_text(), urljoin(url, href))
# STEP 1: GET RESULTS URLS
a_url = 'https://miami.craigslist.org/search/mdc/sub?max_price=900&bundleDuplicates=1&hasPic=1&min_price=400'
getSearchPageListings(a_url)

# Per-listing results keyed by listing URL; both are filled in by MineListing.
bodysum = {}
bodyText = {}
def MineListing(listUrl):
    """Download one listing page and store its summary and body text.

    The page text is reduced to its non-blank lines and then cut up using
    three marker strings found in it:

    * ``bodysum[listUrl]``: text between the "google map" link and the
      "QR Code Link to This Post" marker. Only stored when the map link is
      present; otherwise 'No map.' is printed and no entry is made, so
      readers of ``bodysum`` must tolerate missing keys.
    * ``bodyText[listUrl]``: text between the "QR Code Link to This Post"
      marker and "email to friend". Always stored.

    A marker that is missing no longer produces a bogus ``-1`` index: the
    corresponding slice simply extends to the start/end of the text.

    Args:
        listUrl: Address of the listing page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without a timeout requests waits forever on an unresponsive server.
    res = requests.get(listUrl, timeout=30)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    desc = strip(soup.get_text())

    map_marker = "google map"
    qr_marker = "QR Code Link to This Post"
    end_marker = "email to friend"

    map_at = desc.find(map_marker)
    # Search for the whole marker: a bare "QR" could match earlier text.
    qr_at = desc.find(qr_marker)
    end_at = desc.find(end_marker)

    if map_at == -1:
        print('No map.')
    else:
        # The extra 2 skips the two characters right after the link text
        # (presumably a closing parenthesis and the newline; confirm against
        # a live page).
        summary_start = map_at + len(map_marker) + 2
        summary_end = qr_at if qr_at != -1 else len(desc)
        bodysum[listUrl] = desc[summary_start:summary_end]

    body_start = qr_at + len(qr_marker) if qr_at != -1 else 0
    body_end = end_at if end_at != -1 else len(desc)
    bodyText[listUrl] = desc[body_start:body_end]
# STEP 2: GET DESCRIPTIONS FROM THOSE PAGES
# Only the URLs are needed here, so iterate the values directly.
for listing_url in listings.values():
    MineListing(listing_url)
# STEP 3: PUSH RESULTS INTO EXCEL
wb = openpyxl.Workbook()
ws = wb.create_sheet("Listings", 0)
# One row per listing, starting at row 1: summary in column 2, body text in
# column 3 (column 1 is left empty).
for i, key in enumerate(bodyText, start=1):
    # bodysum has no entry for listings without a map link; .get() leaves the
    # cell empty instead of raising KeyError and aborting the export.
    ws.cell(row=i, column=2).value = bodysum.get(key)
    ws.cell(row=i, column=3).value = bodyText[key]
wb.save("C:/Users/Desktop/Craigslist.xlsx")
print('Done.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment