Skip to content

Instantly share code, notes, and snippets.

@rcdilorenzo
Created March 16, 2019 18:34
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rcdilorenzo/8c2004ed29b06fd73a4d3ddb1c3e0a50 to your computer and use it in GitHub Desktop.
Save rcdilorenzo/8c2004ed29b06fd73a4d3ddb1c3e0a50 to your computer and use it in GitHub Desktop.
A Puppeteer script to download all discussions for a given WorldClass topic (assumes the WORLDCLASS_DOMAIN environment variable is set)
const puppeteer = require('puppeteer');
const read = require('read');
const htmlToText = require('html-to-text').fromString;
const R = require('ramda');
const Promise = require('bluebird');
const fs = require('fs');
const download = require('@jinphen/download2');
const { CookieJar } = require('tough-cookie');
// Argument-flipped version of bluebird's Promise.mapSeries (the `Promise` required above):
// the mapper comes first and the collection second. Below it is used both partially
// applied, `mapSeries(mapper)` inside a `.then(...)`, and fully, `mapSeries(mapper, items)`.
const mapSeries = R.flip(Promise.mapSeries);
/**
 * Collects every discussion post listed on the currently loaded topic page.
 *
 * This function is handed to `page.evaluate`, so it executes inside the
 * browser page; it must stay self-contained and only rely on page globals
 * such as `document`.
 *
 * @returns {Array<Object>} One record per post with `title`, `author`,
 *   `link`, `postedAt`, `html` (raw inner HTML of the post body) and
 *   `attachments` (the `href` of each attachment link node).
 */
const extractPosts = () => {
  const sections = document.getElementsByClassName(
    'd2l-datalist-item d2l-datalist-simpleitem'
  );

  return Array.from(sections, section => {
    // The heading anchor carries both the post URL and its title.
    const { href: link, title } = section.querySelector('.d2l-linkheading-link');

    // The secondary text block reads "<author> posted <date>".
    const byline = section.querySelector('.d2l-textblock-secondary').innerText;
    const [author, postedAt] = byline.split(' posted ');

    const html = section.querySelector('.d2l-htmlblock').innerHTML;

    const attachments = [];
    for (const node of Array.from(section.querySelectorAll('.d2l-filelink-text'))) {
      attachments.push(node.href);
    }

    return { title, author, link, postedAt, html, attachments };
  });
};
/**
 * Collects the entries listed on the currently loaded post page.
 *
 * Passed to `page.evaluate`, so it runs inside the browser page and must not
 * reference anything outside its own body except page globals.
 *
 * @returns {Array<{author: string, postedAt: string, html: string}>} One
 *   record per direct child of the first `.d2l-datalist` element.
 */
const extractComments = () => {
  const { children } = document.querySelector('.d2l-datalist');

  return Array.from(children, section => ({
    author: section.querySelector('.d2l-heading').innerText,
    postedAt: section.querySelector('.d2l-fuzzydate').innerText,
    html: section.querySelector('.d2l-htmlblock').innerHTML,
  }));
};
/**
 * Prompts the user on the terminal and resolves with what they typed.
 *
 * Wraps the callback-style `read(options, callback)` call in a promise.
 *
 * @param {Object} options - Passed straight through to `read` (this script
 *   uses `prompt` and `silent`).
 * @returns {Promise<string>} Resolves with the entered value. Rejects with
 *   the error reported by `read`, or with an `Error` when the entered value
 *   is the empty string.
 */
const readVariable = options => {
  return new Promise((resolve, reject) => {
    read(options, (er, value) => {
      // The original referenced an undefined `err` here, which threw a
      // ReferenceError inside the callback instead of rejecting the promise.
      if (er) {
        reject(er);
        return;
      }
      if (value === '') {
        reject(new Error('A non-empty value is required'));
        return;
      }
      resolve(value);
    });
  });
};
/**
 * Swaps a record's raw `html` field for a plain-text `text` field.
 *
 * The input is not mutated; Ramda's `assoc` and `omit` each return a copy.
 *
 * @param {Object} post - Any record carrying an `html` string.
 * @returns {Object} Copy of `post` with `text` (the HTML converted by
 *   `htmlToText` with `wordwrap: 100`) added and `html` removed.
 */
const interpretHTML = post => {
  const text = htmlToText(post.html, { wordwrap: 100 });
  const withText = R.assoc('text', text, post);
  return R.omit(['html'], withText);
};
/**
 * Downloads one attachment into `<outputFolder>/attachments/<author-slug>/`.
 *
 * Curried with `R.curry`, so it can be partially applied, e.g.
 * `downloadAttachment(folder, cookies, author)` yields a function of `url`.
 *
 * @param {string} outputFolder - Root folder for this run's output.
 * @param {Array<{name: string, value: string}>} cookies - Cookies sent with
 *   the request as a single `Cookie` header.
 * @param {string} author - Post author; lower-cased and hyphenated to form
 *   the sub-folder name.
 * @param {string} url - Attachment URL.
 * @returns {Promise<string>} Resolves with `<folder>/<filename>`, where
 *   `filename` comes from the result of `download`.
 */
const downloadAttachment = R.curry((outputFolder, cookies, author, url) => {
  const cookieHeader = cookies.map(c => `${c.name}=${c.value}`).join('; ');
  // A string pattern in `.replace(' ', '-')` only replaces the FIRST space,
  // so "Mary Ann Smith" became "mary-ann smith". Collapse every whitespace
  // run instead so the slug never contains spaces.
  const authorSlug = author.trim().toLowerCase().replace(/\s+/g, '-');
  const folder = `${outputFolder}/attachments/${authorSlug}`;
  return download(url, folder, { headers: { 'Cookie': cookieHeader } })
    .then(({ filename }) => `${folder}/${filename}`)
    .then(R.tap(savedPath => console.log(`Downloaded ${savedPath}`)));
});
/**
 * Script entry point.
 *
 * Prompts for credentials, the topic URL, a topic name and an output folder;
 * logs in at `https://$WORLDCLASS_DOMAIN`; extracts every post on the topic
 * page; then, for each post in sequence, downloads its attachments and
 * extracts the entries on the post's own page. The result is written to
 * `<folder>/<topicName>-topic.json`.
 *
 * The browser is always closed, even when a step fails, and any failure is
 * reported on stderr with a non-zero exit code.
 */
(async () => {
  const domain = process.env.WORLDCLASS_DOMAIN;
  if (!domain) {
    // Fail before prompting rather than navigating to "https://undefined".
    throw new Error('The WORLDCLASS_DOMAIN environment variable must be set');
  }

  const username = await readVariable({ prompt: 'Username: ' });
  const password = await readVariable({
    prompt: 'Password [not shown]: ',
    silent: true
  });
  const topicURL = await readVariable({ prompt: 'Topic URL: ' });
  const topicName = await readVariable({ prompt: 'Topic Name: ' });
  const folder = await readVariable({ prompt: 'Folder Name: ' });

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    console.log('Logging in...');
    await page.goto(`https://${domain}`, { waitUntil: 'networkidle2' });
    await page.type('#username', username);
    await page.type('#password', password);
    await page.click('[name="Login"]');

    console.log('Extracting posts...');
    // This element is used as the signal that the login has completed.
    await page.waitForSelector('.course-image-container');
    await page.goto(topicURL, { waitUntil: 'networkidle2' });
    const extracted = await page.evaluate(extractPosts);
    const summaries = extracted.map(post => interpretHTML(post));

    console.log('Extracting comments...');
    // Posts are handled one at a time because they all share a single page.
    const posts = await mapSeries(async (post) => {
      // Download attachments
      const cookies = await page.cookies();
      const attachments = await mapSeries(
        downloadAttachment(folder, cookies, post.author),
        post.attachments
      );

      // Download comments
      console.log(`Getting comments for "${post.title}"...`);
      await page.goto(post.link, { waitUntil: 'networkidle2' });
      const rawComments = await page.evaluate(extractComments);
      const comments = rawComments.map(comment => interpretHTML(comment));

      return { ...post, comments, attachments };
    }, summaries);

    // The folder may not exist yet (e.g. when no attachment was downloaded).
    fs.mkdirSync(folder, { recursive: true });
    const filename = `${folder}/${topicName}-topic.json`;
    fs.writeFileSync(filename, JSON.stringify(posts, null, 2));
    console.log(`Topic saved to ${filename}`);
  } finally {
    // Without this a failure above leaves the headless browser process running.
    await browser.close();
  }
})().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment