Skip to content

Instantly share code, notes, and snippets.

@sueszli
Last active January 12, 2024 14:23
Show Gist options
  • Save sueszli/aa4ccae0907eecc07e111247a8ba1675 to your computer and use it in GitHub Desktop.
Save sueszli/aa4ccae0907eecc07e111247a8ba1675 to your computer and use it in GitHub Desktop.
rentals.ca scraper
const { join } = require("path");
// Cache location resolved relative to this file: <this directory>/.cache/puppeteer.
const cacheDirectory = join(__dirname, ".cache", "puppeteer");

// Export the single `cacheDirectory` setting.
// NOTE(review): presumably consumed by Puppeteer as its config file — confirm.
module.exports = { cacheDirectory };
import axios from 'axios'
import * as cheerio from 'cheerio'
import { assert, log } from 'console'
import open from 'open'
// Matches JSON-style `"url": "https://rentals.ca/..."` entries in the raw page
// source. The capture group stops at the closing quote, so several quoted values
// on the same line are never merged into one link.
const SITE_URL_PATTERN = /"url": "(https:\/\/rentals\.ca\/[^"]*)"/g

// Links containing fewer slashes than this (e.g. `https://rentals.ca/toronto`)
// are treated as non-listing pages and are not opened.
const MIN_LISTING_SLASHES = 4

/**
 * Builds the URL of one result page by setting the `p` query parameter.
 * Works whether or not the base URL already carries a query string.
 *
 * @param {string} baseUrl - absolute search URL given on the command line
 * @param {number} pageNum - 1-based page number
 * @returns {string} absolute URL of the requested page
 * @throws {TypeError} if `baseUrl` is not a valid absolute URL
 */
const buildPageUrl = (baseUrl, pageNum) => {
    const pageUrl = new URL(baseUrl)
    pageUrl.searchParams.set('p', String(pageNum))
    return pageUrl.href
}

/**
 * Extracts every rentals.ca URL referenced as `"url": "..."` in a page body.
 *
 * @param {unknown} html - response body; anything but a string yields no links
 * @returns {string[]} the URLs in document order (duplicates included)
 */
const extractSiteUrls = (html) => {
    if (typeof html !== 'string') {
        return []
    }
    return Array.from(html.matchAll(SITE_URL_PATTERN), (match) => match[1])
}

/**
 * Splits links into listings and everything else, based on slash count.
 * Builds two new arrays instead of removing items from the input while
 * iterating over it (which skips the element following each removal).
 *
 * @param {Iterable<string>} links - unique links collected from all pages
 * @returns {{ listings: string[], others: string[] }}
 */
const partitionLinks = (links) => {
    const listings = []
    const others = []
    for (const link of links) {
        const slashCount = (link.match(/\//g) || []).length
        if (slashCount < MIN_LISTING_SLASHES) {
            others.push(link)
        } else {
            listings.push(link)
        }
    }
    return { listings, others }
}

/**
 * Command-line entry point.
 *
 * Reads the search URL from `process.argv[2]`, walks the result pages
 * (`p=1`, `p=2`, ...) until a page contains no rentals.ca URLs, drops links that
 * are not listings, waits for input on stdin and then opens every listing with
 * `open`. Exits the process with code 0 when done, or 1 if the URL is missing.
 *
 * @returns {Promise<void>} rejects if a request fails or the URL is invalid
 */
const main = async () => {
    const url = process.argv[2]
    if (!url) {
        // console.assert only logs, so stop explicitly instead of requesting "undefined&p=1"
        console.error('missing url as argument')
        process.exit(1)
        return
    }

    // A Set keeps insertion order and makes the duplicate check O(1)
    const seen = new Set()

    // NOTE(review): the loop only ends once a page has no matching URL at all
    // (or a request throws) — confirm the site behaves that way past the last page.
    for (let pageNum = 1; ; pageNum++) {
        const pageUrl = buildPageUrl(url, pageNum)
        const response = await axios.get(pageUrl)
        const found = extractSiteUrls(response.data)

        if (found.length === 0) {
            console.log('reached last page -- validate by checking this link: ' + pageUrl)
            break
        }

        let newCount = 0
        for (const link of found) {
            if (!seen.has(link)) {
                seen.add(link)
                newCount++
            }
        }
        console.log('found ' + newCount + ' links')
    }

    const { listings, others } = partitionLinks(seen)
    console.log('emitted ' + others.length + " links that aren't listings: ", others)

    console.log('press enter key to open the ' + listings.length + ' scraped links')
    await new Promise((resolve) => process.stdin.once('data', resolve))

    // Open one at a time so each launch finishes before the next starts
    for (const link of listings) {
        const target = new URL(link, url).href
        console.log('opening: ' + target)
        await open(target)
    }

    // stdin is still being read at this point, so exit explicitly
    process.exit(0)
}
// Run the scraper. Report any failure (e.g. a rejected request) and exit with a
// non-zero code rather than leaving the returned promise unhandled.
main().catch((err) => {
    console.error(err)
    process.exit(1)
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment