Skip to content

Instantly share code, notes, and snippets.

@traviskaufman
Created October 31, 2019 01:50
Show Gist options
  • Save traviskaufman/d17c1e9b901b9e91da46185de596c219 to your computer and use it in GitHub Desktop.
Save traviskaufman/d17c1e9b901b9e91da46185de596c219 to your computer and use it in GitHub Desktop.
Timestamp scraping script used for r/dataisbeautiful October 2019 Challenge
const fs = require('fs');
const fetch = require('node-fetch');
const cheerio = require('cheerio');
// Entry point: run the scraper and report any failure. Setting a non-zero exit
// code lets shells and CI detect that the run did not complete; assigning
// `process.exitCode` (rather than calling `process.exit`) lets pending output flush.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
/**
 * Scrapes jump-scare timestamps for every movie listed in the local
 * `./notebooks/movielist.html` table and writes the results to disk.
 *
 * For each listed movie with a non-zero jump-scare count, the linked page is
 * fetched and every `<p>` whose trimmed text matches `SCARE_RE` is recorded.
 * Movies whose number of matched timestamps differs from the count in the
 * table, or whose page could not be fetched, are listed in `errors.json` so
 * they can be checked manually.
 *
 * Output files (relative to the current working directory):
 *   - `movietimestamps.json`: one record per timestamp found.
 *   - `errors.json`: one record per movie needing manual review.
 *
 * @returns {Promise<void>} Resolves once both files have been written.
 */
async function main() {
  // Captures: [1] the full timestamp, [2]-[4] its numeric fields,
  // [5] the description that follows the en dash.
  const SCARE_RE = /^((\d{1,2}):(\d{1,2})(?::(\d{1,2}))?).*– (.+)$/;

  const $list = cheerio.load(fs.readFileSync('./notebooks/movielist.html', 'utf8'));
  const $directors = $list('td.column-2');
  const $jumpCounts = $list('td.column-4');

  // NOTE(review): director and count are looked up by the link's index, which
  // assumes every table row has exactly one link in column 1 - confirm against
  // the saved HTML.
  const movies = $list('td.column-1 > a')
    .map((i, el) => {
      const $el = $list(el);
      return {
        title: $el.text().trim().replace(/\s+/g, ' '),
        director: $directors.eq(i).text().trim(),
        link: $el.attr('href'),
        numScares: parseInt($jumpCounts.eq(i).text().trim(), 10),
      };
    })
    .toArray();

  const timestampRecords = [];
  const errors = [];

  for (const movie of movies) {
    if (movie.numScares === 0) {
      console.debug(`Skipping ${movie.title} b/c it has no jump scares`);
      continue;
    }

    console.debug('Fetching jump scare info for', movie.title);
    let html;
    try {
      const response = await fetch(movie.link);
      if (!response.ok) {
        throw new Error(`HTTP ${response.status} ${response.statusText}`);
      }
      html = await response.text();
    } catch (err) {
      // One unreachable page must not discard everything scraped so far:
      // record the movie for manual review and move on to the next one.
      console.warn('NOTE: for movie', movie.title, `the page could not be fetched (${err.message}). Go check this manually`);
      errors.push({
        'Movie Name': movie.title,
        'Director': movie.director,
        'Number of timestamps found': 0,
        'Number of timestamps specified': movie.numScares,
      });
      continue;
    }

    const $page = cheerio.load(html);
    const timestampMatches = $page('p')
      .map((_, p) => $page(p).text().trim())
      .toArray()
      .map((text) => text.match(SCARE_RE))
      .filter(Boolean);

    if (timestampMatches.length !== movie.numScares) {
      console.warn('NOTE: for movie', movie.title, `number of timestamps found (${timestampMatches.length}) did not match number of scares specified (${movie.numScares}). Go check this manually`);
      errors.push({
        'Movie Name': movie.title,
        'Director': movie.director,
        'Number of timestamps found': timestampMatches.length,
        'Number of timestamps specified': movie.numScares,
      });
    }

    // Timestamps are still recorded on a mismatch; errors.json only flags the movie.
    for (const match of timestampMatches) {
      const ts = formatTimestamp(match[1]);
      timestampRecords.push({
        'Movie Name': movie.title,
        'Director': movie.director,
        'Timestamp': ts,
        'Timestamp Seconds': formattedTimestampToSeconds(ts),
        'Description': match[5],
      });
    }
    console.log('Added', timestampMatches.length, 'timestamps from', movie.title);
  }

  fs.writeFileSync('movietimestamps.json', JSON.stringify(timestampRecords, null, 2), 'utf-8');
  fs.writeFileSync('errors.json', JSON.stringify(errors, null, 2), 'utf-8');
}
/**
 * Normalizes a scraped timestamp into a colon-separated `hh:mm:ss` string.
 *
 * @param {string} ts - Timestamp written as 'm:ss', 'mm:ss', or 'h:mm:ss'.
 * @returns {string} The timestamp with an hour field always present and every
 *   field shorter than two characters prefixed with a single '0'.
 */
function formatTimestamp(ts) {
  const pieces = ts.split(':');
  // With three or more fields the first is the hour; otherwise the hour is '0'.
  const fields = pieces.length >= 3
    ? [pieces[0], pieces[1], pieces[2]]
    : ['0', pieces[0], pieces[1]];

  const padded = [];
  for (const field of fields) {
    padded.push(field.length >= 2 ? field : `0${field}`);
  }
  return padded.join(':');
}
/**
 * Converts an `hh:mm:ss` timestamp into a total number of seconds.
 *
 * @param {string} ts - Timestamp with three colon-separated numeric fields.
 * @returns {number} hours * 3600 + minutes * 60 + seconds, or NaN when any of
 *   the three fields is missing or does not parse as a base-10 integer.
 */
function formattedTimestampToSeconds(ts) {
  const [hours, minutes, seconds] = ts.split(':').map((field) => parseInt(field, 10));
  const fromHours = hours * 60 * 60;
  const fromMinutes = minutes * 60;
  return fromHours + fromMinutes + seconds;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment