Skip to content

Instantly share code, notes, and snippets.

@luizpvas
Created October 12, 2018 20:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save luizpvas/a7a5a096892d666ff61c1cbc925c8f74 to your computer and use it in GitHub Desktop.
Save luizpvas/a7a5a096892d666ff61c1cbc925c8f74 to your computer and use it in GitHub Desktop.
Extração de comentários do Podcast Hipsters.tech
const puppeteer = require('puppeteer');
const fs = require('fs');
// Episode identifiers. An episode page is reached by putting its id in the
// URL: hipsters.tech?page_id=<page_id>.
//
// The list was collected by evaluating
// `Array.from(document.querySelectorAll('#page_id option')).map(option => option.value)`
// on an episode page, where a select input conveniently lists every episode.
const pageIds = [
  "1940", "1932", "1915", "1923", "1910", "1901", "1893", "1884", "1877", "1868",
  "1864", "1854", "1847", "1823", "1800", "1791", "1785", "1763", "1757", "1749",
  "1740", "1736", "1729", "1718", "1708", "1697", "1689", "1684", "1676", "1661",
  "1646", "1642", "1635", "1623", "1614", "1591", "1586", "1571", "1548", "1526",
  "1508", "1475", "1468", "1445", "1416", "1404", "1391", "1371", "1355", "1339",
  "1326", "1298", "1281", "1262", "1245", "1067", "1040", "1141", "1129", "1115",
  "1005", "1058", "1045", "1030", "1015", "1007", "984", "989", "963", "948",
  "939", "930", "923", "917", "887", "878", "871", "866", "859", "835",
  "827", "816", "806", "793", "778", "670", "680", "689", "718", "685",
  "683", "667", "639", "638", "631", "614", "566", "579", "471", "558",
  "469", "467", "521", "509", "491", "449", "394", "428", "412", "373",
  "372", "330", "332", "315", "307", "271", "263", "223",
];
// Run the crawler for each page, one episode at a time. A failure on one
// episode is reported and the crawl carries on with the next one; the process
// exit code is set to 1 at the end if any episode failed.
(async () => {
  const failedPageIds = [];

  for (let i = 0; i < pageIds.length; i++) {
    const pageId = pageIds[i];
    // Progress is 1-based so the last episode reads "N / N" instead of "N-1 / N".
    console.log(`Scraping page ${i + 1} / ${pageIds.length}`);
    try {
      await scrapePage(pageId);
    } catch (err) {
      // Keep going: one broken episode should not abort the whole crawl.
      failedPageIds.push(pageId);
      console.error(`Failed to scrape episode ${pageId}:`, err);
    }
  }

  if (failedPageIds.length > 0) {
    console.error(`Could not scrape ${failedPageIds.length} episode(s): ${failedPageIds.join(', ')}`);
    // Signal the partial failure to the caller without cutting the run short.
    process.exitCode = 1;
  }
})();
/**
 * Runs the scraper for a single episode: opens the episode page, collects its
 * title, tags and participants, follows the Disqus iframe to load the
 * comments, and writes everything as JSON to ./episodes/<pageId>.
 *
 * @param {string} pageId - Episode id, used as the `page_id` query parameter
 *   and as the output file name.
 * @returns {Promise<void>} Resolves once the file is written and the browser
 *   has been closed.
 * @throws Rejects when an awaited step (navigation, selector wait, page
 *   evaluation, file write) fails; the browser is closed in that case as well.
 */
async function scrapePage(pageId) {
  // Make sure the output directory exists before the slow crawl starts, so a
  // missing folder does not throw away the scraped data at the very end.
  fs.mkdirSync('./episodes', { recursive: true });

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1024, height: 800 });

    console.log(`Visiting episode ${pageId}...`);
    await page.goto(`https://hipsters.tech?page_id=${pageId}`);

    // Scroll to the comments element so Disqus renders the iframe.
    await page.evaluate(() => {
      const commentsContainer = document.querySelector('#comments');
      // getBoundingClientRect() is viewport-relative; add the current scroll
      // offset to get the absolute position expected by scrollTo().
      const scrollTop = commentsContainer.getBoundingClientRect().top + window.scrollY;
      window.scrollTo(0, scrollTop);
    });

    // Wait for the iframe to become visible.
    await page.waitForSelector('#comments iframe');

    // Get the iframe's URL (this is where the comments are extracted from)
    // together with the episode metadata shown on the page.
    const { disqusUrl, title, tags, participants } = await page.evaluate(() => {
      const disqusUrl = document.querySelector('#comments iframe').getAttribute('src');
      const title = document.querySelector('.post-title h1').innerText;

      // Only the post-meta links whose URL contains "category" are kept as tags.
      const tags = Array.from(document.querySelectorAll('.post-meta li>a'))
        .filter((link) => link.href.includes('category'))
        .map((link) => link.innerText);

      // Participants are read from the second list of the entry content; each
      // item contributes the text before its first comma.
      // If the page has no such list, fall back to no participants instead of
      // failing the whole episode.
      const participantsList = document.querySelectorAll('.entry-content ul')[1];
      const participants = participantsList
        ? Array.from(participantsList.querySelectorAll('li')).map((li) => li.innerText.split(',')[0])
        : [];

      return { disqusUrl, title, tags, participants };
    });

    // Visit the Disqus page and wait for the posts to be rendered.
    await page.goto(disqusUrl);
    await page.waitForSelector('.post-list');

    // Plain timer-based pause; does not depend on any Puppeteer wait helper.
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    // Keep clicking "load more" until the button is gone or hidden.
    let hasMoreComments = true;
    while (hasMoreComments) {
      console.log('Loading comments...');
      await sleep(5000);
      hasMoreComments = await page.evaluate(() => {
        const loadMoreButton = document.querySelector('[data-action="more-posts"]');
        // offsetParent is null when the element is not rendered (e.g. hidden),
        // which is treated as "nothing more to load".
        if (loadMoreButton && loadMoreButton.offsetParent) {
          loadMoreButton.click();
          return true;
        }
        return false;
      });
    }

    // Grab the comments + authors.
    const comments = await page.evaluate(() => {
      // Text of the first match under `root`, or null when there is none.
      const textOf = (root, selector) => {
        const node = root.querySelector(selector);
        return node ? node.innerText : null;
      };

      return Array.from(document.querySelectorAll('li.post')).map((post) => {
        // Not every post necessarily carries a profile link (presumably guest
        // or removed comments - to be confirmed); record null rather than
        // throwing and losing every comment of the episode.
        const profileLink = post.querySelector('a[data-username]');
        return {
          postId: post.id,
          username: profileLink ? profileLink.getAttribute('data-username') : null,
          authorName: textOf(post, '.author'),
          comment: textOf(post, '.post-message'),
        };
      });
    });

    console.log(`Found ${comments.length} comments for the episode ${title}`);
    fs.writeFileSync(`./episodes/${pageId}`, JSON.stringify({ title, comments, tags, participants }));
    console.log("ALL DONE!");
  } finally {
    // Always release the browser process, even when a step above failed.
    await browser.close();
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment