Skip to content

Instantly share code, notes, and snippets.

@jsks
Last active July 26, 2016 12:16
Show Gist options
  • Save jsks/6b71078aecf8516c6246aed5b745013b to your computer and use it in GitHub Desktop.
Save jsks/6b71078aecf8516c6246aed5b745013b to your computer and use it in GitHub Desktop.
'use strict'
/*
* blocket.js
* Webscrapes blocket apt ads and emails results
*
* Files
* email_template.html: html template for results email
* filter.txt: newline-delimited list of regex keywords; ads matching any of them are filtered out
*
* Config Vars (set through env)
* SENDADDRESS: email address to send from using smtp
* PASSWORD: password for send address
* RECIPIENTS: comma-delimited list of recipients for scraping results
*/
const https = require('https'),
fs = require('fs'),
cheerio = require('cheerio'),
Mustache = require('mustache'),
nodemailer = require('nodemailer'),
juice = require('juice')
const userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'
/**
 * Performs an HTTPS GET request and buffers the response body as a UTF-8 string.
 *
 * @param {Object|string} options - request options, handed to https.get unchanged
 * @returns {Promise<string>} resolves with the full body when the status is 200;
 *   rejects with the numeric status code for any other status, or with the
 *   error emitted by the request/response stream
 */
function get(options) {
    return new Promise((pass, fail) => {
        https.get(options, res => {
            if (res.statusCode !== 200) {
                // Discard the body so the socket is released, and stop here
                // instead of buffering a response nobody will read
                res.resume()
                fail(res.statusCode)
                return
            }

            res.setEncoding('utf8')
            let data = ''
            res.on('data', chunk => data += chunk)
            res.on('end', () => pass(data))
            // A failure while streaming the body must settle the promise too
            res.on('error', fail)
        }).on('error', fail)
    })
}
/**
 * Extracts the text between the first "(" and the last ")" of a string,
 * e.g. the address inside a CSS `url(...)` value.
 *
 * @param {string} [str=''] - text to search
 * @returns {string} the text between the parentheses, or '' when there is no
 *   parenthesised part
 */
function parseStyle(str = '') {
    const found = str.match(/\(.*\)/)
    if (!found) {
        return ''
    }

    // Drop the opening and closing parenthesis themselves
    const wrapped = found[0]
    return wrapped.substring(1, wrapped.length - 1)
}
/**
 * Builds a plain ad record from one listing element.
 *
 * @param {Function} $ - cheerio instance the element belongs to
 * @param {Object} e - the listing element to read
 * @returns {Promise<Object>} resolves with
 *   { id, desc, link, imgLink, rooms, rent, size, date }; rejects with
 *   whatever error is thrown while reading the element
 */
function parseItem($, e) {
    try {
        const row = $(e)
        const textOf = selector => row.find(selector).text()

        const id = row.attr('id')
        const desc = textOf('h4').trim()
        const anchor = row.find('a')
        const link = anchor.attr('href')
        // The image address is taken from the anchor's inline style
        const imgLink = parseStyle(anchor.attr('style'))
        const rooms = textOf('span.rooms')
        const rent = textOf('span.monthly_rent')
        const size = textOf('span.size')
        const date = new Date(row.find('time').attr('datetime'))

        return Promise.resolve({ id, desc, link, imgLink, rooms, rent, size, date })
    } catch (err) {
        return Promise.reject(err)
    }
}
/**
 * Parses a listing page and converts every `.item_row` element into an ad
 * record via parseItem.
 *
 * @param {string} html - markup of the listing page
 * @returns {Promise<Object[]>} resolves with one record per row, in document
 *   order; rejects if any row fails to parse
 */
function parseHTML(html) {
    const $ = cheerio.load(html)
    const rows = $('.item_row').toArray()
    const pending = rows.map(row => parseItem($, row))

    return Promise.all(pending)
}
/**
 * Flags an ad as unwanted by setting `n.matched`.
 *
 * With an empty filter list the ad is returned as-is, with no request made
 * and no `matched` property set. Otherwise the page behind `n.link` is
 * requested from www.blocket.se and `n.matched` becomes truthy when any
 * filter regex matches the lower-cased `p.object-text` body or the
 * lower-cased ad description, or when the leading number in `n.rooms` is 1.
 *
 * @param {RegExp[]} filterlist - patterns that disqualify an ad
 * @param {Object} n - ad record; mutated in place
 * @returns {Object|Promise<Object>} the same ad record, directly when the
 *   list is empty, otherwise through a promise
 */
function isFiltered(filterlist, n) {
    if (filterlist.length === 0) {
        return n
    }

    const request = {
        host: 'www.blocket.se',
        // Keep only the part of the link after the ".se" host name
        path: n.link.replace(/.*[.]se/, "")
    }

    return get(request).then(html => {
        const page = cheerio.load(html)
        const body = page('p.object-text').text().trim().toLowerCase()

        const keywordHit = filterlist.some(pattern => {
            return pattern.test(body) || pattern.test(n.desc.toLowerCase())
        })

        n.matched = keywordHit ||
            (n.rooms && Number(n.rooms.match(/\d*/)[0]) === 1)
        return n
    })
}
/**
 * Tells whether a path refers to a regular file.
 *
 * @param {string} f - path to inspect
 * @returns {boolean} true when lstat succeeds and reports a regular file;
 *   false otherwise, including when lstat throws (e.g. the path is missing)
 */
function isFile(f) {
    let stats
    try {
        stats = fs.lstatSync(f)
    } catch (e) {
        return false
    }
    return stats.isFile()
}
/**
 * Reads a text file, creating it empty first when it is not an existing
 * regular file.
 *
 * @param {string} f - path of the file to read
 * @returns {string} the UTF-8 content of the file, or '' when the file had
 *   to be created
 */
function load(f) {
    if (!isFile(f)) {
        // writeFileSync returns undefined, so the empty result has to be
        // returned explicitly rather than chained with &&
        fs.writeFileSync(f, '')
        return ''
    }
    return fs.readFileSync(f, 'utf8')
}
/**
 * Writes a value to disk as a single line of JSON followed by a newline,
 * replacing any previous content of the file.
 *
 * @param {string} f - path of the file to write
 * @param {*} json - value to serialise with JSON.stringify
 */
function save(f, json) {
    const serialized = JSON.stringify(json)
    fs.writeFileSync(f, `${serialized}\n`)
}
/**
 * Renders the results email from email_template.html (read from the
 * script's directory) with the given ads, newest first.
 *
 * The view passed to Mustache exposes:
 *   items     - the ads sorted by `date`, descending
 *   dateNow   - lambda returning the current local time as text
 *   parseDate - lambda formatting the current item's `date` for Sweden
 *
 * @param {Object[]} items - ad records, each with a Date in `date`; the
 *   array itself is left untouched
 * @returns {string} the rendered HTML
 */
function buildTemplate(items) {
    // This really should be cached
    const template = fs.readFileSync(`${__dirname}/email_template.html`, 'utf8')

    // Sort a copy: Array.prototype.sort works in place and would otherwise
    // reorder the caller's array
    const sortedItems = items.slice().sort((a, b) => b.date - a.date)

    return Mustache.render(template, {
        items: sortedItems,
        dateNow: () => new Date().toLocaleTimeString(),
        parseDate: function() {
            // BCP 47 tag is language-REGION: Swedish as used in Sweden
            return this.date.toLocaleString('sv-SE')
        }
    })
}
/**
 * Inlines the stylesheet of an HTML message with juice and mails it through
 * smtp.gmail.com.
 *
 * @param {string[]} recipients - addresses to send to
 * @param {string} from - sender address, also used as the SMTP user name
 * @param {string} pass - SMTP password for the sender address
 * @param {string} msg - HTML message body before CSS inlining
 * @returns {Promise<Object>} resolves with the info object reported by the
 *   transport; rejects with the juice error or the transport error
 */
function sendMail(recipients, from, pass, msg) {
    // Credentials are percent-encoded because the user name contains "@"
    // and the password may contain characters that are special in a URL
    const url = `smtp://${encodeURIComponent(from)}:${encodeURIComponent(pass)}@smtp.gmail.com`
    const sender = nodemailer.createTransport(url)
    const juiceOpts = {
        webResources: {
            links: true,
            images: false,
            scripts: false
        }
    }

    return new Promise((resolve, reject) => {
        juice.juiceResources(msg, juiceOpts, (juiceErr, html) => {
            // Do not send a mail with an undefined body when inlining failed
            if (juiceErr) {
                reject(juiceErr)
                return
            }

            const opts = {
                from: `"Blocket Script" <${from}>`,
                to: recipients.join(','),
                subject: 'Fresh apartments!',
                html
            }

            sender.sendMail(opts, (mailErr, info) => {
                if (mailErr) {
                    reject(mailErr)
                    return
                }
                resolve(info)
            })
        })
    })
}
/**
 * Guards a required configuration value: a truthy value is handed back
 * unchanged, a falsy one is reported on stderr and ends the process with
 * exit code 1.
 *
 * @param {*} x - the configuration value to check
 * @param {string} str - name of the variable, used in the error message
 * @returns {*} `x`
 */
function assertConfig(x, str) {
    if (x) {
        return x
    }

    console.error(`Missing env variable: ${str}`)
    process.exit(1)
    return x
}
// --- Configuration -------------------------------------------------------
// Each variable is validated as a raw string. RECIPIENTS is split only
// after the check: splitting first throws on an unset variable and always
// yields a truthy array, so the check could never fire.
const sendAddress = assertConfig(process.env.SENDADDRESS, "SENDADDRESS"),
    pass = assertConfig(process.env.PASSWORD, "PASSWORD"),
    recipients = assertConfig(process.env.RECIPIENTS, "RECIPIENTS").split(',')

// One regex per non-empty line of filter.txt. The `|| ''` keeps the chain
// working if load yields a non-string for a file that did not exist yet.
const filterlist = (load(`${__dirname}/filter.txt`) || '')
        .split('\n')
        .filter(n => n.length > 0)
        .map(n => new RegExp(n)),
    isShortTerm = isFiltered.bind(null, filterlist)

const opts = {
    host: 'www.blocket.se',
    path: '/bostad/uthyres?cg_multi=3020&w=115&m=179',
    headers: { 'user-agent': userAgent }
}

// Ids of ads that were already mailed. An empty or freshly created store
// has to fall back to the JSON text '[]': JSON.parse of an empty string
// (or of an empty array, which stringifies to one) throws.
const json = JSON.parse(load(`${__dirname}/store.json`) || '[]')

// --- Pipeline: scrape -> drop known ads -> drop filtered ads -> mail -> remember
get(opts)
    .then(parseHTML)
    .then(items => items.filter(n => json.indexOf(n.id) === -1))
    .then(newItems => Promise.all(newItems.map(isShortTerm)))
    .then(newItems => newItems.filter(n => !n.matched))
    .then(newItems => {
        if (newItems.length === 0) {
            return undefined
        }

        // Record the ids only once the mail went out, so a failed send is
        // retried on the next run, and return the promise so a mail error
        // reaches the catch handler below
        return sendMail(recipients, sendAddress, pass, buildTemplate(newItems))
            .then(() => save(`${__dirname}/store.json`, json.concat(newItems.map(n => n.id))))
    })
    .catch(console.error)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment