A Python script to scrape Craigslist apartment listings into an Excel spreadsheet.
import requests
import bs4
import openpyxl

# Helper: drop empty lines from scraped page text.
def strip(txt):
    ret = ""
    for l in txt.split("\n"):
        if l.strip() != '':
            ret += l + "\n"
    return ret

listings = dict()

# Record a listing title/URL pair, skipping titles we have already seen.
def addListing(desc, url):
    if desc in listings:
        print('Duplicate')
    else:
        listings[desc] = url

# Collect every listing link on one page of search results.
def getSearchPageListings(url):
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    for a in soup.find_all("a", class_="result-title hdrlnk"):
        addListing(a.string, a.get('href'))

# STEP 1: GET RESULT URLs
a_url = 'https://miami.craigslist.org/search/mdc/sub?max_price=900&bundleDuplicates=1&hasPic=1&min_price=400'
getSearchPageListings(a_url)
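
# Optional sketch, not called by default: Craigslist paginates search results.
# Historically the offset was controlled by an "s" query parameter in steps of
# 120 results per page; that parameter name and step size are assumptions
# here, so verify them against the live site before relying on this helper.
def getAllSearchListings(base_url, page_size=120, max_pages=5):
    for page in range(max_pages):
        getSearchPageListings(base_url + '&s=' + str(page * page_size))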

bodysum = dict()
bodyText = dict()

# Pull the location summary and full description out of a single listing page.
def MineListing(listUrl):
    res = requests.get(listUrl)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    desc = strip(soup.get_text())
    if desc.find("google map") == -1:
        print('No map.')
    else:
        # Slice on landmark strings in the page text: the summary sits between
        # the "google map" link and the QR code caption, and the description
        # sits between the QR caption and the "email to friend" link.
        bodysum[listUrl] = desc[desc.find("google map") + 12:desc.find("QR Code Link to This Post")]
        bodyText[listUrl] = desc[desc.find("QR") + 25:desc.find("email to friend")]
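
# Alternative sketch: the string offsets above are brittle. Craigslist listing
# pages have historically wrapped the description in an element with
# id="postingbody"; if that still holds (an assumption worth checking), the
# body text can be pulled directly instead of sliced by character position:
def MineListingBody(listUrl):
    res = requests.get(listUrl)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    body = soup.find(id='postingbody')  # assumed element id, not used above
    if body is not None:
        bodyText[listUrl] = strip(body.get_text())
    else:
        print('No posting body found.')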

# STEP 2: GET DESCRIPTIONS FROM THOSE PAGES
for key in listings:
    MineListing(listings[key])

# STEP 3: PUSH RESULTS INTO EXCEL
wb = openpyxl.Workbook()
ws = wb.create_sheet("Listings", 0)
i = 1
for key in bodyText:
    # Column 1 is left empty; columns 2 and 3 hold the summary and description.
    ws.cell(row=i, column=2).value = bodysum[key]
    ws.cell(row=i, column=3).value = bodyText[key]
    i += 1
wb.save("C:/Users/Desktop/Craigslist.xlsx")
print('Done.')
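
# Optional courtesy tweak (a sketch, not part of the original script): pausing
# between requests keeps the scraper from hammering the site. Inside the
# Step 2 loop it would look like this:
#
#   import time
#   for key in listings:
#       MineListing(listings[key])
#       time.sleep(1)  # one-second pause between page fetches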