Skip to content

Instantly share code, notes, and snippets.

@TheRockStarDBA
Forked from adrianhorning08/zillowScraper.js
Created September 15, 2023 00:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TheRockStarDBA/51418f18abd1d728a146d8df65ce22d5 to your computer and use it in GitHub Desktop.
Save TheRockStarDBA/51418f18abd1d728a146d8df65ce22d5 to your computer and use it in GitHub Desktop.
Zillow Scraper
/**
 * Scrolls the Zillow results list container to the bottom in fixed steps,
 * giving lazily-loaded listing cards time to render between ticks.
 * Resolves once the container stops growing after a full pass.
 * @returns {Promise<void>} resolves when the end of the list is reached
 */
async function scrollDown() {
  const wrapper = document.querySelector("#search-page-list-container");
  // BUG FIX: without this guard a missing container threw inside the interval
  // callback and the promise never settled, hanging the caller forever.
  if (!wrapper) {
    console.warn("scrollDown: #search-page-list-container not found");
    return;
  }
  await new Promise((resolve) => {
    let totalHeight = 0;
    const distance = 600;
    // Note: the callback has nothing to await, so it is not async (the
    // original marked it async needlessly).
    const timer = setInterval(() => {
      const scrollHeightBefore = wrapper.scrollHeight;
      wrapper.scrollBy(0, distance);
      totalHeight += distance;
      if (totalHeight >= scrollHeightBefore) {
        totalHeight = 0;
        // Re-read after scrolling: growth means more cards were lazy-loaded.
        const scrollHeightAfter = wrapper.scrollHeight;
        if (scrollHeightAfter > scrollHeightBefore) {
          // More content appeared — keep scrolling on the next tick.
          return;
        }
        // No growth observed — we are at the bottom.
        clearInterval(timer);
        resolve();
      }
    }, 400);
  });
}
/**
 * Extracts listing records from the currently rendered search results.
 * Each listing card embeds structured data in a <script> tag; visible
 * beds/baths/price are read from the card's DOM.
 * @returns {Array<Object>} one flat record per listing card
 */
function getListings() {
  const listings = [];
  const lis = document.querySelectorAll("#search-page-list-container ul li");
  for (const listing of lis) {
    // Only <li> elements whose class list includes "ListItem" are cards.
    const classes = Array.from(listing.classList).join(" ");
    if (!classes.includes("ListItem")) {
      continue;
    }
    // The card's structured data lives in an embedded <script> tag.
    const script = listing.querySelector("script");
    if (!script) {
      continue;
    }
    // BUG FIX: guard JSON.parse — one malformed card no longer aborts the
    // whole scrape with an uncaught SyntaxError.
    let json;
    try {
      json = JSON.parse(script.textContent);
    } catch {
      continue;
    }
    // Beds/baths: first run of digits in the details list items, if present.
    const beds = listing
      ?.querySelector(
        'ul[class*="StyledPropertyCardHomeDetailsList"] li:nth-child(1)'
      )
      ?.textContent?.match(/\d+/)?.[0];
    const baths = listing
      ?.querySelector(
        'ul[class*="StyledPropertyCardHomeDetailsList"] li:nth-child(2)'
      )
      ?.textContent?.match(/\d+/)?.[0];
    const priceString = listing.querySelector(
      'span[data-test="property-card-price"]'
    )?.textContent;
    // BUG FIX: the original called priceString.replace unconditionally and
    // threw a TypeError whenever the price span was missing.
    const price = priceString
      ? Number(priceString.replace(/[^0-9.-]+/g, ""))
      : "";
    listings.push({
      ...json.address,
      ...json.floorSize,
      ...json.geo,
      priceString: priceString,
      price,
      beds: beds ? Number(beds) : "",
      baths: baths ? Number(baths) : "",
      sqft: json?.floorSize?.value
        ? Number(json?.floorSize?.value.replace(/[^0-9.-]+/g, ""))
        : "",
      name: json.name,
      url: json.url,
    });
  }
  return listings;
}
/**
 * Converts an array of flat records to CSV and triggers a browser download.
 * @param {Array<Object>} jsonData - records; the header row comes from the
 *   keys of the first item, and every row is emitted in that column order
 * @param {string} fileName - suggested name for the downloaded file
 */
function createCSV(jsonData, fileName) {
  // BUG FIX: the original crashed on an empty array (jsonData[0] undefined).
  if (!Array.isArray(jsonData) || jsonData.length === 0) {
    console.warn("createCSV: nothing to export");
    return;
  }
  // RFC 4180 escaping: quote a field when it contains a comma, quote, or
  // newline, and double any embedded quotes. The original only handled
  // commas, producing corrupt rows for values containing quotes/newlines.
  const escapeField = (value) => {
    const str = String(value ?? "");
    return /[",\n]/.test(str) ? `"${str.replaceAll('"', '""')}"` : str;
  };
  const headers = Object.keys(jsonData[0]);
  const csvData = [headers.join(",")];
  for (const item of jsonData) {
    // BUG FIX: iterate the HEADER keys, not each item's own keys — items
    // with missing/extra keys no longer shift values into wrong columns.
    csvData.push(headers.map((key) => escapeField(item[key])).join(","));
  }
  // Package the CSV text as a Blob and download it via a temporary link.
  const csvBlob = new Blob([csvData.join("\n")], {
    type: "text/csv;charset=utf-8",
  });
  const csvUrl = URL.createObjectURL(csvBlob);
  const link = document.createElement("a");
  link.href = csvUrl;
  link.target = "_blank";
  link.download = fileName;
  document.body.appendChild(link);
  link.click();
  // Clean up: remove the link and release the Blob URL.
  document.body.removeChild(link);
  URL.revokeObjectURL(csvUrl);
}
/**
 * Orchestrates the scrape: scroll the current results page, collect its
 * listings, advance via the "Next page" link until it is missing or
 * disabled, then download everything as a CSV.
 * @returns {Promise<void>}
 */
async function scrapeZillow() {
  let page = 1;
  const allListings = [];
  // Single loop replaces the original's duplicated click-then-loop shape.
  while (true) {
    console.log("page", page);
    await scrollDown();
    const listings = getListings();
    console.log(`You scraped ${listings.length} listings!`);
    console.log(
      `If you need anything scraped, email me: adrian@thewebscrapingguy.com`
    );
    console.log(
      `Check this out for faster, more in depth results: https://apify.com/adrian_horning/best-zillow-scraper`
    );
    allListings.push(...listings);
    const nextButton = document.querySelector('a[title="Next page"]');
    // BUG FIX: the original called nextButton.getAttribute without a null
    // check inside the loop, throwing a TypeError when the pagination link
    // disappears on the last page.
    if (!nextButton || nextButton.getAttribute("aria-disabled") === "true") {
      break;
    }
    nextButton.click();
    page++;
  }
  console.log(`Congrats! 🎉 You scraped ${allListings.length} listings!`);
  createCSV(allListings, `zillowListings-${new Date().getTime()}.csv`);
}
// Entry point: run against the Zillow search results page currently open
// in the browser (paste into the DevTools console).
await scrapeZillow();
@TheRockStarDBA
Copy link
Author

Equivalent Python version:

import json
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_listings(soup):
    """Extract listing records from a parsed Zillow search results page.

    Args:
        soup: BeautifulSoup document for one search results page.

    Returns:
        list[dict]: one flat record per listing card; beds/baths/price are
        '' when the corresponding element or digits are absent.
    """

    def _first_number(tag):
        # First run of digits in the tag's text, or '' when the tag is
        # missing or contains no digits.
        if tag is None:
            return ''
        match = re.search(r'\d+', tag.text)
        return match.group(0) if match else ''

    listings = []
    for li in soup.select("#search-page-list-container ul li"):
        classes = " ".join(li.get('class', []))
        if "ListItem" not in classes:
            continue
        script = li.find("script")
        if script is None:
            continue
        # BUG FIX: the original evaluated `.text` BEFORE its existence check
        # ran and indexed re.findall(...)[0] without verifying a match, so a
        # card with a missing element or digit-free text raised
        # AttributeError/IndexError. It also ran each select_one query twice.
        beds = _first_number(
            li.select_one('ul[class*="StyledPropertyCardHomeDetailsList"] li:nth-child(1)')
        )
        baths = _first_number(
            li.select_one('ul[class*="StyledPropertyCardHomeDetailsList"] li:nth-child(2)')
        )
        price_tag = li.select_one('span[data-test="property-card-price"]')
        price_string = price_tag.text if price_tag else ''
        price = re.sub('[^0-9.]', '', price_string)
        json_data = json.loads(script.text)
        listings.append({
            **json_data.get('address', {}),
            **json_data.get('floorSize', {}),
            **json_data.get('geo', {}),
            'priceString': price_string,
            'price': price,
            'beds': beds,
            'baths': baths,
            'sqft': re.sub('[^0-9.]', '', json_data.get('floorSize', {}).get('value', '')),
            'name': json_data.get('name', ''),
            'url': json_data.get('url', ''),
        })
    return listings

def scrape_zillow(url):
    """Scrape every paginated Zillow search results page starting at `url`.

    Args:
        url: absolute URL of the first search results page.

    Returns:
        list[dict]: all listing records across all pages.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    page = 1
    all_listings = []
    while True:
        print(f"Scraping page {page}")
        response = requests.get(url)
        # Fail loudly on HTTP errors instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        listings = get_listings(soup)
        print(f"Found {len(listings)} listings")
        all_listings.extend(listings)
        next_button = soup.select_one('a[title="Next page"]')
        if next_button is None or next_button.get('aria-disabled') == 'true':
            break
        # BUG FIX: the "Next page" href is typically relative; resolve it
        # against the current page URL so the next requests.get succeeds.
        url = urljoin(url, next_button['href'])
        page += 1
        time.sleep(2)  # polite delay between page requests
    print(f"Scraped {len(all_listings)} listings in total")
    return all_listings

# Entry point: scrape starting from the generic for-sale search page and
# write the collected listings to a CSV via pandas.
listings = scrape_zillow('https://www.zillow.com/homes/for_sale/')
df = pd.DataFrame(listings)
df.to_csv('zillowListings.csv', index=False)

Note: This code comes with some caveats.

  • Web scraping is subject to the terms and conditions of the website being scraped. Always make sure you're allowed to scrape the site and that your script is in compliance with the website's terms before running it.
  • The nature of web scraping means that this script could break if Zillow updates its website layout or the structure of its HTML.
  • The script doesn't handle errors or edge cases, so you might want to add error handling.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment