Skip to content

Instantly share code, notes, and snippets.

@danny460
Created April 15, 2022 17:12
Show Gist options
  • Save danny460/4dada06de7cd5aae7e2371b9fab1cab7 to your computer and use it in GitHub Desktop.
Save danny460/4dada06de7cd5aae7e2371b9fab1cab7 to your computer and use it in GitHub Desktop.
Scraping PropertyGuru with puppeteer because searching sucks
const fs = require('fs');
const path = require('path');
const readline = require('readline');

const converter = require('json-2-csv');
const puppeteer = require('puppeteer');
/**
 * Entry point: walks the paginated condo directory, collecting the listings of
 * every page, and writes them all to ./output/listings.csv once the site
 * reports that there are no more results.
 *
 * A new browser is launched for each page and is always closed again, even
 * when the loop ends or a step throws, so no browser process is left behind.
 */
(async () => {
  const allListings = [];
  let offset = 1;
  let completed = false;

  while (!completed) {
    // Non-headless so that a person can interact with the window when the
    // bot-detection page shows up (see the askInput prompt below).
    const browser = await puppeteer.launch({ headless: false });
    try {
      const page = await browser.newPage();
      await page.goto(`https://www.propertyguru.com.sg/condo-directory/search-project/${offset}?limit=100&market=residential&newProject=all`);

      const trapped = await page.evaluate(isTrapped);
      if (trapped) {
        await askInput('We\'re trapped by bot detection. Press enter after manual captcha challenge passed');
      }

      const listings = await page.evaluate(getListings);
      // Append in place instead of rebuilding the whole array on every page.
      allListings.push(...listings);

      completed = await page.evaluate(isComplete);
      if (!completed) {
        console.log({
          listings, offset,
        });
      }
    } finally {
      // Runs on success, on the final page, and on errors alike.
      await browser.close();
    }

    if (!completed) {
      offset++;
      // Short pause between page loads.
      await delay(1000);
    }
  }

  await saveToCSV(allListings, './output/listings.csv');
})().catch((err) => {
  // Surface failures explicitly and signal them through the exit code.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Converts the listings to CSV and writes the result to `filePath`.
 *
 * The parent directory of `filePath` is created when it does not exist yet.
 *
 * NOTE(review): relies on the callback form `converter.json2csv(data, cb)`;
 * confirm the installed json-2-csv version still supports it.
 *
 * @param {object[]} listings - Records to serialise.
 * @param {string} filePath - Destination path of the CSV file.
 * @returns {Promise<void>} Resolves after the file is written; rejects with
 *   the conversion error or with any file-system error.
 */
function saveToCSV(listings, filePath) {
  return new Promise((resolve, reject) => {
    converter.json2csv(listings, (err, csv) => {
      if (err) {
        reject(err);
        return;
      }
      // A throw inside this callback is not guaranteed to reach the promise
      // (it would escape if the converter calls back asynchronously), so
      // file-system failures are forwarded to reject() explicitly.
      try {
        fs.mkdirSync(path.dirname(filePath), { recursive: true });
        fs.writeFileSync(filePath, csv);
        resolve();
      } catch (writeErr) {
        reject(writeErr);
      }
    });
  });
}
/**
 * Collects one record per `div.listing-card` element of the current document.
 *
 * This function is handed to `page.evaluate`, so it must stay self-contained:
 * it may only use `document` and helpers declared inside its own body.
 *
 * @returns {Array<{id: ?string, name: (string|undefined), url: (string|undefined),
 *   address: (string|undefined), minPrice: number, maxPrice: number,
 *   labels: string[], developer: (string|undefined), unitsForSale: number,
 *   unitsForRent: number}>} The parsed listings, in document order.
 */
function getListings() {
  // Drops every character except digits, '.' and '-' before converting.
  const currencyTextToFloat = (text) => Number(text.replace(/[^0-9.-]+/g, ''));

  // Reads the first whitespace-separated token of the element's text as a
  // number; a missing element counts as 0. The text is trimmed first so
  // leading whitespace does not yield an empty first token.
  const leadingCount = (el) => Number(el?.textContent.trim().split(/\s+/)[0] ?? 0);

  return [...document.querySelectorAll('div.listing-card')].map((card) => {
    const id = card.getAttribute('data-listing-id');

    // Name and link to the details page.
    const titleLink = card.querySelector('.nav-link');
    const name = titleLink?.title;
    const url = titleLink?.href;

    // Address.
    const address = card.querySelector('span[itemprop=streetAddress]')?.textContent;

    // Price range: must be looked up inside this card; a document-wide lookup
    // would return the first card's price for every listing.
    const priceRangeText = card.querySelector('span.list-price__start')?.textContent ?? '';
    const parts = priceRangeText.split('-');
    let minPrice = 0;
    let maxPrice = 0;
    if (parts.length === 2) {
      minPrice = currencyTextToFloat(parts[0]);
      maxPrice = currencyTextToFloat(parts[1]);
    }

    // Labels: built year, lease type, property type.
    const labels = [...card.querySelectorAll('.listing-property-type li span')].map((el) => el.textContent);

    // Developer.
    const developer = card.querySelector('div.developer__name')?.textContent;

    // Sell/rent counters.
    const unitsForSale = leadingCount(card.querySelector('.units_for_sale'));
    const unitsForRent = leadingCount(card.querySelector('.units_for_rent'));

    return {
      id, name, url, address, minPrice, maxPrice, labels, developer, unitsForSale, unitsForRent,
    };
  });
}
/**
 * Tells whether the current document shows the "no results" search title.
 *
 * Passed to `page.evaluate`, so it only relies on `document`.
 *
 * @returns {boolean} true when the `.title.search-title` element exists and
 *   its text contains the end-of-results marker; false otherwise.
 */
function isComplete() {
  const titleNode = document.querySelector('.title.search-title');
  if (titleNode == null) {
    return false;
  }
  const titleText = titleNode.textContent;
  if (titleText == null) {
    return false;
  }
  return titleText.includes('Oops... No 0 Results');
}
/**
 * Tells whether the current document is the human-verification page.
 *
 * Passed to `page.evaluate`, so it only relies on `document`.
 *
 * @returns {boolean} true when the `.detailed-text-wrapper h1` element exists
 *   and its text contains the verification message; false otherwise.
 */
function isTrapped() {
  const headline = document.querySelector('.detailed-text-wrapper h1');
  if (headline == null) {
    return false;
  }
  const headlineText = headline.textContent;
  if (headlineText == null) {
    return false;
  }
  return headlineText.includes('Hi! We just want to make sure you are a human');
}
/**
 * Prints `query` on stdout and waits for one line of input on stdin.
 *
 * @param {string} query - Prompt text shown to the user.
 * @returns {Promise<string>} Resolves with the entered line; the readline
 *   interface is closed before the promise resolves.
 */
function askInput(query) {
  const prompt = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  return new Promise((resolve) => {
    const onAnswer = (reply) => {
      prompt.close();
      resolve(reply);
    };
    return prompt.question(query, onAnswer);
  });
}
/**
 * Returns a promise that resolves once a `setTimeout` of `time` has fired.
 *
 * @param {number} time - Timeout passed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>} Settles when the timer fires; never rejects.
 */
function delay(time) {
  return new Promise((wake) => {
    setTimeout(wake, time);
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment