Created
April 15, 2022 17:12
-
-
Save danny460/4dada06de7cd5aae7e2371b9fab1cab7 to your computer and use it in GitHub Desktop.
Scraping PropertyGuru with Puppeteer because the site's own search sucks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const converter = require('json-2-csv'); | |
const fs = require('fs'); | |
const readline = require('readline'); | |
/**
 * Entry point: walks the PropertyGuru condo directory page by page, collects
 * every listing found on each page and writes the combined result to
 * ./output/listings.csv.
 *
 * A fresh browser is launched for every page (as in the original flow) and is
 * always closed again, including on the final page and when a step throws.
 */
(async () => {
  const outputDir = './output';
  // Create the output directory up front so a missing folder cannot make the
  // final write fail after the whole directory has been scraped.
  fs.mkdirSync(outputDir, { recursive: true });

  const allListings = [];
  let offset = 1;
  let completed = false;

  while (!completed) {
    const browser = await puppeteer.launch({ headless: false });
    try {
      const page = await browser.newPage();
      await page.goto(`https://www.propertyguru.com.sg/condo-directory/search-project/${offset}?limit=100&market=residential&newProject=all`);

      const trapped = await page.evaluate(isTrapped);
      if (trapped) {
        // Headful mode lets the operator solve the challenge by hand.
        await askInput('We\'re trapped by bot detection. Press enter after manual captcha challenge passed');
      }

      const listings = await page.evaluate(getListings);
      allListings.push(...listings);

      completed = await page.evaluate(isComplete);
      if (!completed) {
        console.log({
          listings, offset,
        });
      }
    } finally {
      // Runs on success, on the last page and on errors, so no browser
      // process is left behind.
      await browser.close();
    }

    if (!completed) {
      offset++;
      // Short pause between page loads.
      await delay(1000);
    }
  }

  await saveToCSV(allListings, `${outputDir}/listings.csv`);
})().catch((err) => {
  // Surface failures instead of leaving the promise rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Converts the listings to CSV with json-2-csv's callback API and writes the
 * result to disk.
 *
 * @param {object[]} listings - Records to serialise.
 * @param {string} filePath - Destination file; its directory must exist.
 * @returns {Promise<void>} Resolves once the file is written; rejects with
 *   the conversion error or the file-system error.
 */
function saveToCSV(listings, filePath) {
  return new Promise((resolve, reject) => {
    converter.json2csv(listings, (err, csv) => {
      if (err) {
        reject(err);
        return;
      }
      try {
        fs.writeFileSync(filePath, csv);
        resolve();
      } catch (writeErr) {
        // A throw inside the converter callback would otherwise escape the
        // promise entirely and leave the caller's await pending.
        reject(writeErr);
      }
    });
  });
}
/**
 * Extracts one record per listing card from the current document.
 *
 * This function is passed to page.evaluate, so it executes inside the page;
 * its helpers are kept nested so the function stays self-contained.
 *
 * @returns {Array<{id: ?string, name: (string|undefined), url: (string|undefined),
 *   address: (string|undefined), minPrice: number, maxPrice: number,
 *   labels: string[], developer: (string|undefined),
 *   unitsForSale: number, unitsForRent: number}>}
 *   minPrice/maxPrice are 0 when the card has no "min - max" price text.
 */
function getListings() {
  // Strips everything except digits, dot and minus before converting.
  function currencyTextToFloat(text) {
    return Number(text.replace(/[^0-9.-]+/g, ''));
  }

  // Reads the leading token of a counter element's text as a number; 0 when
  // the element is missing. The text is trimmed first so leading whitespace
  // in the markup does not turn the count into 0.
  function leadingCount(card, selector) {
    const text = card.querySelector(selector)?.textContent;
    return Number(text?.trim().split(/\s+/)[0] ?? 0);
  }

  const cards = document.querySelectorAll('div.listing-card');

  return [...cards].map((d) => {
    const id = d.getAttribute('data-listing-id');

    // Name and link to the details page.
    const titleLink = d.querySelector('.nav-link');
    const name = titleLink?.title;
    const url = titleLink?.href;

    const address = d.querySelector('span[itemprop=streetAddress]')?.textContent;

    // Price range: query within this card. Querying the whole document would
    // hand every listing the first card's price.
    const priceRangeText = d.querySelector('span.list-price__start')?.textContent ?? '';
    const parts = priceRangeText.split('-');
    let minPrice = 0;
    let maxPrice = 0;
    if (parts.length === 2) {
      minPrice = currencyTextToFloat(parts[0]);
      maxPrice = currencyTextToFloat(parts[1]);
    }

    // Labels: built year, lease type, property type.
    const labels = [...d.querySelectorAll('.listing-property-type li span')].map((el) => el.textContent);

    const developer = d.querySelector('div.developer__name')?.textContent;

    // Sell/rent counters.
    const unitsForSale = leadingCount(d, '.units_for_sale');
    const unitsForRent = leadingCount(d, '.units_for_rent');

    return {
      id, name, url, address, minPrice, maxPrice, labels, developer, unitsForSale, unitsForRent,
    };
  });
}
/**
 * Reports whether the current document shows the empty-results heading.
 *
 * @returns {boolean} true when the search title contains the "no results"
 *   text; false otherwise, including when the title element is absent.
 */
function isComplete() {
  const titleNode = document.querySelector('.title.search-title');
  if (titleNode == null) {
    return false;
  }
  const titleText = titleNode.textContent ?? '';
  return titleText.includes('Oops... No 0 Results');
}
/**
 * Reports whether the current document is the human-verification page.
 *
 * @returns {boolean} true when the h1 inside .detailed-text-wrapper contains
 *   the verification prompt; false otherwise, including when it is absent.
 */
function isTrapped() {
  const heading = document.querySelector('.detailed-text-wrapper h1');
  const headingText = heading == null ? '' : (heading.textContent ?? '');
  return headingText.includes('Hi! We just want to make sure you are a human');
}
/**
 * Prints a prompt on stdout and waits for one line of input on stdin.
 *
 * @param {string} query - Prompt text shown to the user.
 * @returns {Promise<string>} Resolves with the entered line after the
 *   readline interface has been closed.
 */
function askInput(query) {
  const terminal = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  return new Promise((done) => {
    const onAnswer = (reply) => {
      terminal.close();
      done(reply);
    };
    terminal.question(query, onAnswer);
  });
}
/**
 * Returns a promise that resolves after the given number of milliseconds.
 *
 * @param {number} time - Delay in milliseconds, passed to setTimeout.
 * @returns {Promise<void>} Resolves with no value once the timer fires.
 */
function delay(time) {
  const scheduleWakeUp = (wake) => {
    setTimeout(wake, time);
  };
  return new Promise(scheduleWakeUp);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment