Skip to content

Instantly share code, notes, and snippets.

@emsifa
Created September 16, 2019 11:24
Show Gist options
  • Save emsifa/a6ffb458e75fa8549984c3b83cc79610 to your computer and use it in GitHub Desktop.
Save emsifa/a6ffb458e75fa8549984c3b83cc79610 to your computer and use it in GitHub Desktop.
Puppeteer Crawling Kompas

Cara Pakai

node crawl <keyword>
const puppeteer = require('puppeteer');
const request = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');
// Search keyword is the first CLI argument after the script name.
const [, , keyword] = process.argv;
// Dimensions shared by the browser window and the page viewport.
const height = 768;
const width = 1366;
console.log({ keyword });
/**
 * Downloads the page behind each search result and extracts its article text.
 *
 * Pages are requested one at a time, in the order of `searchResults`.
 *
 * @param {Array<{url: string}>} searchResults - Entries whose `url` is fetched.
 * @returns {Promise<string[]>} Trimmed text of each page's `.read__content`
 *   element, in the same order as the input.
 */
async function crawlArticles(searchResults) {
  const texts = [];
  for (const { url } of searchResults) {
    console.log(`Grab html from: ${url}`);
    const markup = await request(url);
    const select = cheerio.load(markup);
    texts.push(select('.read__content').text().trim());
  }
  return texts;
}
/**
 * Collects the search results shown on the current page and attaches the
 * text of each linked article.
 *
 * @param {object} page - Puppeteer page currently showing search results.
 * @returns {Promise<Array<{title: string, url: string, content: string}>>}
 *   One entry per result link, with `content` filled in by `crawlArticles`.
 */
async function crawlSearchResults(page) {
  console.log('Scraping search results');
  // This callback is executed by `page.evaluate`, so `$` is resolved in the
  // page, not in Node (assumes the page exposes a jQuery-style `$`).
  const searchResults = await page.evaluate(() => {
    const links = [];
    $(".gsc-resultsbox-visible").find('a.gs-title').each(function() {
      const title = $(this).text();
      const url = $(this).attr('href');
      links.push({ title, url });
    });
    return links;
  });
  console.log('Grab contents from search results');
  const contents = await crawlArticles(searchResults);
  // Attach each article's text to the result at the same index.
  return searchResults.map((entry, idx) =>
    Object.assign(entry, { content: contents[idx] })
  );
}
/**
 * Checks whether the pagination control has a link for page `n`.
 *
 * @param {object} page - Puppeteer page showing search results.
 * @param {number} n - Page number to look for.
 * @returns {Promise<boolean>} `true` when at least one element matches
 *   `.gsc-cursor-page[aria-label="Page n"]`.
 */
async function hasPage(page, n) {
  // Evaluated in the page; it must not close over any Node-side variables.
  const pageLinkExists = (pageNumber) => {
    const matches = $(`.gsc-cursor-page[aria-label="Page ${pageNumber}"]`);
    return matches.length > 0;
  };
  return page.evaluate(pageLinkExists, n);
}
/**
 * Waits for the given amount of time.
 *
 * @param {number} duration - Time to wait, in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
async function delay(duration) {
  return new Promise((resolve) => setTimeout(resolve, duration));
}
/**
 * Clicks the pagination link for page `n`, then waits two seconds.
 *
 * @param {object} page - Puppeteer page showing search results.
 * @param {number} n - Page number whose link is clicked.
 * @returns {Promise<void>} Resolves after the fixed post-click pause.
 */
async function openPage(page, n) {
  // Evaluated in the page; it must not close over any Node-side variables.
  const clickPageLink = (pageNumber) => {
    $(`.gsc-cursor-page[aria-label="Page ${pageNumber}"]`).click();
  };
  await page.evaluate(clickPageLink, n);
  // Fixed pause after the click before the caller continues.
  return delay(2000);
}
/**
 * Entry point: searches Kompas for `keyword`, walks every result page and
 * writes the accumulated results to `results-<keyword>.json`.
 *
 * The output file is rewritten after each page, so results gathered so far
 * are kept on disk even if a later page fails.
 */
(async () => {
  // Without a keyword the search would literally be for "undefined".
  if (!keyword) {
    console.error('Usage: node crawl <keyword>');
    process.exitCode = 1;
    return;
  }

  const browser = await puppeteer.launch({
    headless: false, // The browser is visible
    ignoreHTTPSErrors: true,
    args: [`--window-size=${width},${height}`],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width, height });
    // Encode the keyword so spaces, `&`, `#`, etc. cannot corrupt the query string.
    await page.goto(
      `https://search.kompas.com/search/?q=${encodeURIComponent(keyword)}&submit=Submit+Query`
    );

    let results = [];
    let p = 1;
    while (true) {
      // Log the page counter `p`, not the Puppeteer `page` object.
      console.log(`Crawling Page ${p}`);
      const pageResults = await crawlSearchResults(page);
      results = results.concat(pageResults);
      fs.writeFileSync(`results-${keyword}.json`, JSON.stringify(results, null, 4));

      p++;
      const hasNext = await hasPage(page, p);
      if (!hasNext) {
        break;
      }
      await openPage(page, p);
    }
  } finally {
    // Always release the browser, even when a crawl step throws.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
{
"name": "crawl-kompas-search",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"cheerio": "^1.0.0-rc.3",
"puppeteer": "^1.20.0",
"request": "^2.88.0",
"request-promise": "^4.2.4"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment