Last active
August 12, 2020 13:03
-
-
Save magician11/a979906401591440bd6140bd14260578 to your computer and use it in GitHub Desktop.
How to grab the page source from any dynamically generated webpage and then process it.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const CDP = require('chrome-remote-interface'); | |
const chromeLauncher = require('chrome-launcher'); | |
const cheerio = require('cheerio'); | |
/**
 * Launches headless Chrome, loads the cinema showtime page, switches the site
 * language to English, grabs the rendered page source over the DevTools
 * protocol and prints the cinema name plus every movie title found in it.
 *
 * The browser and the protocol connection are always released, even when a
 * step fails part-way through.
 */
(async function() {
  const timeout = ms => new Promise(resolve => setTimeout(resolve, ms));

  // Declared outside the try block so the finally clause can clean up
  // whatever was successfully created before a failure.
  let chrome;
  let protocol;

  try {
    chrome = await chromeLauncher.launch({
      chromeFlags: ['--disable-gpu', '--headless']
    });
    protocol = await CDP({ port: chrome.port });

    // See API docs: https://chromedevtools.github.io/devtools-protocol/
    const { Page, Runtime, DOM } = protocol;
    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);

    // Subscribe to the load event BEFORE navigating so it cannot be missed.
    const pageLoaded = Page.loadEventFired();
    await Page.navigate({
      url: 'https://www.sfcinemacity.com/showtime/cinema/9936'
    });
    await pageLoaded;

    console.log('Page loaded! Now waiting a few seconds for all the JS to load...');
    await timeout(3000); // give the JS some time to load

    console.log('Selecting English..');
    // first set the language to English
    const result = await Runtime.evaluate({
      expression:
        "document.querySelector('.lang-switcher li:nth-of-type(2) a').click()"
    });
    // An exception thrown inside the page (e.g. the selector matched nothing)
    // is reported in the result rather than rejecting the promise. Keep the
    // scrape best-effort, but make the failure visible.
    if (result.exceptionDetails) {
      console.log(
        'Could not switch language:',
        result.exceptionDetails.text
      );
    }
    // NOTE(review): the source is read right after the click; if the site
    // re-renders asynchronously a short wait may be needed here - confirm.

    // get the page source
    const rootNode = await DOM.getDocument({ depth: -1 });
    const pageSource = await DOM.getOuterHTML({
      nodeId: rootNode.root.nodeId
    });

    // load the page source into cheerio
    console.log('Processing page source...');
    const $ = cheerio.load(pageSource.outerHTML);

    // perform queries
    console.log('Getting movie times for', $('.showtime-cinema-name').text());
    $('.showtime-box').each((i, movieElement) => {
      console.log($(movieElement).find('.movie-detail .name').text());
    });
  } catch (err) {
    console.error(err);
  } finally {
    // Always release the connection and the browser process, in that order.
    try {
      if (protocol) {
        await protocol.close();
      }
    } catch (closeErr) {
      console.error(closeErr);
    }
    if (chrome) {
      await chrome.kill();
    }
  }
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Yes, using async/await is the way to go. One of the modules I wrote does it that way. Check out this file. Source code copied below.