Skip to content

Instantly share code, notes, and snippets.

@magician11
Last active August 12, 2020 13:03
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save magician11/a979906401591440bd6140bd14260578 to your computer and use it in GitHub Desktop.
Save magician11/a979906401591440bd6140bd14260578 to your computer and use it in GitHub Desktop.
How to grab the page source from any dynamically generated webpage and then process it.
const CDP = require('chrome-remote-interface');
const chromeLauncher = require('chrome-launcher');
const cheerio = require('cheerio');
// Scrapes the SF Cinema City showtime page with headless Chrome and logs the
// cinema name followed by the title of every movie found in the rendered HTML.
(async function() {
  // Resolves after `ms` milliseconds; used to give client-side JS time to render.
  const timeout = ms => new Promise(resolve => setTimeout(resolve, ms));

  let chrome;
  let protocol;
  try {
    chrome = await chromeLauncher.launch({
      chromeFlags: ['--disable-gpu', '--headless']
    });
    protocol = await CDP({ port: chrome.port });

    // See API docs: https://chromedevtools.github.io/devtools-protocol/
    const { Page, Runtime, DOM } = protocol;
    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);

    // Subscribe to the load event before navigating so it cannot be missed,
    // and await the navigation so a failure is reported by the catch below.
    const pageLoaded = Page.loadEventFired();
    await Page.navigate({
      url: 'https://www.sfcinemacity.com/showtime/cinema/9936'
    });
    await pageLoaded;

    console.log('Page loaded! Now waiting a few seconds for all the JS to load...');
    await timeout(3000); // give the JS some time to load

    // first set the language to English
    console.log('Selecting English..');
    await Runtime.evaluate({
      expression:
        "document.querySelector('.lang-switcher li:nth-of-type(2) a').click()"
    });

    // get the page source
    const rootNode = await DOM.getDocument({ depth: -1 });
    const pageSource = await DOM.getOuterHTML({
      nodeId: rootNode.root.nodeId
    });

    // load the page source into cheerio
    console.log('Processing page source...');
    const $ = cheerio.load(pageSource.outerHTML);

    // perform queries
    console.log('Getting movie times for', $('.showtime-cinema-name').text());
    $('.showtime-box').each((i, movieElement) => {
      console.log(
        $(movieElement)
          .find('.movie-detail .name')
          .text()
      );
    });
  } catch (err) {
    console.log(err);
  } finally {
    // Always release the debugging connection and the browser process,
    // including when a step above failed part-way through.
    if (protocol) {
      await protocol.close();
    }
    if (chrome) {
      await chrome.kill();
    }
  }
})();
@magician11
Copy link
Author

@PatrickHeneise
Copy link

Thanks, that's a great starting point!

Is there a way to wait until the process is done? I added a return Promise.resolve() at the end and return Page.loadEventFired..., but that doesn't seem to work.

@PatrickHeneise
Copy link

PatrickHeneise commented Aug 10, 2017

Example that waits until the function returns:

const CDP = require('chrome-remote-interface')

/**
 * Connects to Chrome over the DevTools protocol, loads http://example.com,
 * waits three seconds for client-side scripts, and then closes the connection.
 *
 * @returns {Promise<undefined>} Resolves once all steps finished and the
 *   connection was closed; rejects if connecting, enabling a domain, or
 *   navigating fails (the connection is still closed in that case).
 */
async function x () {
  let protocol
  try {
    protocol = await CDP()

    const timeout = ms => new Promise(resolve => setTimeout(resolve, ms))

    // See API docs: https://chromedevtools.github.io/devtools-protocol/
    const { Page, Runtime, DOM } = protocol
    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()])

    // Subscribe to the load event before navigating so it cannot be missed,
    // and await the navigation so a failure rejects this function.
    const pageLoaded = Page.loadEventFired()
    await Page.navigate({ url: 'http://example.com' })

    // wait until the page says it's loaded...
    await pageLoaded
    console.log('Page loaded! Now waiting a few seconds for all the JS to load...')
    await timeout(3000) // give the JS some time to load

    console.log('Processing page source...')

    console.log('Doing some fancy stuff here ...')

    console.log('All done.')
  } finally {
    // Single point of cleanup: close exactly once, on success and on failure.
    if (protocol) {
      await protocol.close()
    }
  }
}

// Entry point: runs x() between the 'start' and 'end' log lines. A rejection
// is logged and turned into a failing exit code instead of being left unhandled.
(async function () {
  console.log('start')
  await x()
  console.log('end')
})().catch(err => {
  console.error(err)
  process.exitCode = 1
})

by https://stackoverflow.com/a/45589001/459329

@magician11
Copy link
Author

Yes, using async/await is the way to go. One of the modules I wrote does it that way. Check out this file; the source code is copied below.

const CDP = require('chrome-remote-interface');
const chromeLauncher = require('chrome-launcher');
const cheerio = require('cheerio');

/**
 * Pauses an async flow for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<undefined>} Promise that fulfills once the timer fires.
 */
const timeout = ms => {
  return new Promise(done => {
    setTimeout(done, ms);
  });
};

// Private helper: renders a cinema's showtime page in headless Chrome, switches
// it to English, selects the requested day, and returns the resulting HTML.
// The CDP connection and the Chrome process are released on success and failure.
const fetchRenderedShowtimeHtml = async (movieTheatreId, dayOffset) => {
  const chrome = await chromeLauncher.launch({
    chromeFlags: ['--disable-gpu', '--headless', '--no-sandbox']
  });

  try {
    const protocol = await CDP({ port: chrome.port });

    try {
      // See API docs: https://chromedevtools.github.io/devtools-protocol/
      const { Page, Runtime, DOM } = protocol;
      await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);

      // Subscribe to the load event before navigating so it cannot be missed.
      const pageLoaded = Page.loadEventFired();
      await Page.navigate({
        url: `https://www.sfcinemacity.com/showtime/cinema/${movieTheatreId}`
      });

      // wait until the page says it's loaded...
      await pageLoaded;
      await timeout(3000); // give the JS some time to load

      // first set the language option to English, to convert the content to English
      await Runtime.evaluate({
        expression:
          "document.querySelector('.lang-switcher li:nth-of-type(2) a').click()"
      });

      // click the date we want to get showtimes for
      await Runtime.evaluate({
        expression: `document.querySelector('[data-slick-index="${
          dayOffset
        }"]').click()`
      });

      // get the page source
      const rootNode = await DOM.getDocument({ depth: -1 });
      const pageSource = await DOM.getOuterHTML({
        nodeId: rootNode.root.nodeId
      });

      return pageSource.outerHTML;
    } finally {
      await protocol.close();
    }
  } finally {
    await chrome.kill();
  }
};

// Private helper: extracts the showtime data structure from the rendered HTML.
const parseShowtimes = (html, movieTheatreId) => {
  // load the page source into cheerio
  const $ = cheerio.load(html);

  const movieTheatreData = {
    date: $('.slick-slide.selected .date').text(),
    movieTheatreName: $('.showtime-cinema-name').text(),
    movieTheatreId,
    movies: []
  };

  // for each movie showing on this day at this movie theatre..
  $('.showtime-box').each((movieIndex, movieNode) => {
    // collate all the cinemas it's showing at (the showtimes and language per cinema)
    const cinemas = [];
    $(movieNode)
      .find('.showtime-item')
      .each((cinemaIndex, cinemaNode) => {
        cinemas.push({
          // second space-separated word of the first list item, minus its last character
          language: $(cinemaNode)
            .find('.right-section .list-item')
            .first()
            .text()
            .split(' ')[1]
            .slice(0, -1),
          // all time slots for this cinema as one comma-separated string
          times: $(cinemaNode)
            .find('.time-list .time-item')
            .map((index, el) => $(el).text())
            .get()
            .join()
        });
      });

    // then finally capture the title, the rating, and the cinema showtimes collated above
    movieTheatreData.movies.push({
      movieTitle: $(movieNode)
        .find('.movie-detail .name')
        .text(),
      // text after 'Rate: ' in the first detail item; undefined when that marker is absent
      rating: $(movieNode)
        .find('.movie-detail .movie-detail-list .list-item')
        .first()
        .text()
        .split('Rate: ')[1],
      cinemas
    });
  });

  return movieTheatreData;
};

/**
 * Scrapes the showtimes of one SF Cinema City movie theatre for one day.
 *
 * @param {number|string} movieTheatreId - Identifier appended to the
 *   `/showtime/cinema/` URL of the site.
 * @param {number} [dayOffset=0] - Value of the `data-slick-index` attribute of
 *   the date element to click before the page is read.
 * @returns {Promise<{date: string, movieTheatreName: string, movieTheatreId: *,
 *   movies: Array<{movieTitle: string, rating: (string|undefined),
 *   cinemas: Array<{language: string, times: string}>}>}>}
 * @throws {Error} When launching Chrome, driving the page, or parsing the HTML
 *   fails. The message starts with "Error scraping movie data from SF Cinema
 *   City: " and the original error is attached as `cause`.
 */
const getShowtimes = async (movieTheatreId, dayOffset = 0) => {
  try {
    // First scrape the showtime data using Google Chrome from the SF Cinemacity website
    const html = await fetchRenderedShowtimeHtml(movieTheatreId, dayOffset);

    // now process that HTML
    return parseShowtimes(html, movieTheatreId);
  } catch (err) {
    // An async function has no `reject`; throwing is what rejects its promise.
    const scrapeError = new Error(
      `Error scraping movie data from SF Cinema City: ${err}`
    );
    scrapeError.cause = err;
    throw scrapeError;
  }
};

module.exports = {
  getShowtimes
};

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment