Skip to content

Instantly share code, notes, and snippets.

@ans-4175
Created November 26, 2019 01:51
Show Gist options
  • Save ans-4175/45da1c2b1eea988394526f8c25d340c2 to your computer and use it in GitHub Desktop.
Save ans-4175/45da1c2b1eea988394526f8c25d340c2 to your computer and use it in GitHub Desktop.
Crawlers
const request = require('request');
const Promise = require('bluebird');
const cheerio = require('cheerio');
const fs = require("node-fs-extra");
const ThrottleEngine = require("throttle-exec");
// Value handed to both ThrottleEngine constructors below; presumably the cap on
// how many wrapped calls may run at once — confirm against the throttle-exec docs.
const maximumProcess = 10;
// Two independent throttle instances: one wraps getAuthors, the other wraps getBook.
const throttleInstanceAuthor = new ThrottleEngine(maximumProcess)
const throttleInstanceBooks = new ThrottleEngine(maximumProcess)
const punctuationRegEx = /[!-/:-@[-`{-~¡-©«-¬®-±´¶-¸»¿×÷˂-˅˒-˟˥-˫˭˯-˿͵;΄-΅·϶҂՚-՟։-֊־׀׃׆׳-״؆-؏؛؞-؟٪-٭۔۩۽-۾܀-܍߶-߹।-॥॰৲-৳৺૱୰௳-௺౿ೱ-ೲ൹෴฿๏๚-๛༁-༗༚-༟༴༶༸༺-༽྅྾-࿅࿇-࿌࿎-࿔၊-၏႞-႟჻፠-፨᎐-᎙᙭-᙮᚛-᚜᛫-᛭᜵-᜶។-៖៘-៛᠀-᠊᥀᥄-᥅᧞-᧿᨞-᨟᭚-᭪᭴-᭼᰻-᰿᱾-᱿᾽᾿-῁῍-῏῝-῟῭-`´-῾\u2000-\u206e⁺-⁾₊-₎₠-₵℀-℁℃-℆℈-℉℔№-℘℞-℣℥℧℩℮℺-℻⅀-⅄⅊-⅍⅏←-⏧␀-␦⑀-⑊⒜-ⓩ─-⚝⚠-⚼⛀-⛃✁-✄✆-✉✌-✧✩-❋❍❏-❒❖❘-❞❡-❵➔➘-➯➱-➾⟀-⟊⟌⟐-⭌⭐-⭔⳥-⳪⳹-⳼⳾-⳿⸀-\u2e7e⺀-⺙⺛-⻳⼀-⿕⿰-⿻\u3000-〿゛-゜゠・㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉃㉐㉠-㉿㊊-㊰㋀-㋾㌀-㏿䷀-䷿꒐-꓆꘍-꘏꙳꙾꜀-꜖꜠-꜡꞉-꞊꠨-꠫꡴-꡷꣎-꣏꤮-꤯꥟꩜-꩟﬩﴾-﴿﷼-﷽︐-︙︰-﹒﹔-﹦﹨-﹫!-/:-@[-`{-・¢-₩│-○-�]|\ud800[\udd00-\udd02\udd37-\udd3f\udd79-\udd89\udd90-\udd9b\uddd0-\uddfc\udf9f\udfd0]|\ud802[\udd1f\udd3f\ude50-\ude58]|\ud809[\udc00-\udc7e]|\ud834[\udc00-\udcf5\udd00-\udd26\udd29-\udd64\udd6a-\udd6c\udd83-\udd84\udd8c-\udda9\uddae-\udddd\ude00-\ude41\ude45\udf00-\udf56]|\ud835[\udec1\udedb\udefb\udf15\udf35\udf4f\udf6f\udf89\udfa9\udfc3]|\ud83c[\udc00-\udc2b\udc30-\udc93]/g;
// createDir
/**
 * Ensure that a directory exists, creating it together with any missing
 * parent directories.
 *
 * Creation is best-effort: the returned promise always resolves so that the
 * caller's chain keeps going, but an unexpected mkdir failure is logged
 * instead of being silently dropped.
 *
 * @param {string} dirName - Path of the directory to create.
 * @returns {Promise<string>} Resolves with `dirName` once the attempt is done.
 */
function createDir(dirName) {
  return new Promise((resolve) => {
    // `recursive: true` creates missing parents and succeeds when the
    // directory is already there, so no separate exists-check (and no
    // check-then-create race between concurrent callers) is needed.
    fs.mkdir(dirName, { recursive: true }, (err) => {
      // EEXIST is tolerated for runtimes that ignore the `recursive` option.
      if (err && err.code !== 'EEXIST') {
        console.error(err);
      }
      resolve(dirName);
    });
  });
}
// write to file
/**
 * Write `content` to `filePath`, logging the target path first.
 *
 * @param {string} filePath - Destination file path.
 * @param {string} content - Data to write.
 * @returns {Promise<string>} Resolves with `filePath` after a successful
 *   write; rejects with the write error otherwise.
 */
function writeToFile(filePath, content) {
  return new Promise((resolve, reject) => {
    console.log('writing to:', filePath);
    fs.writeFile(filePath, content, (err) => {
      if (err) {
        console.error(err);
        // Reject with the error itself: console.error() returns undefined,
        // so passing its result would hide the failure reason from callers.
        reject(err);
        return;
      }
      resolve(filePath);
    });
  });
}
// HOME
/**
 * Fetch the blog home page and collect the distinct link targets found in
 * the `#Label1 div ul li` list items, in document order.
 *
 * @returns {Promise<string[]>} Resolves with the unique `href` values;
 *   rejects when the request fails or the page cannot be processed.
 */
function getAll() {
  return new Promise((resolve, reject) => {
    request({
      method: 'GET',
      url: 'http://kepadapuisi.blogspot.co.id/'
    }, (err, response, body) => {
      if (err) {
        console.error(err);
        // Settle the promise so callers are not left waiting forever.
        reject(err);
        return;
      }
      try {
        // Keep the cheerio handle local rather than leaking a global `$`.
        const $ = cheerio.load(body);
        // A Set drops duplicates while preserving first-seen order.
        const hrefs = new Set();
        $('#Label1 div ul li').each((index, item) => {
          const href = $('a', item).attr('href');
          if (typeof href !== 'undefined') {
            hrefs.add(href);
          }
        });
        resolve(Array.from(hrefs));
      } catch (parseErr) {
        // A throw inside this callback would otherwise bypass the promise.
        reject(parseErr);
      }
    });
  });
}
// BOOKS
/**
 * Fetch one listing page and collect the distinct post links found on it.
 *
 * The author name is the last non-empty `/`-separated segment of `url`.
 * Links are taken from the first `div div div h3 a` anchor inside each
 * `#Blog1 div.blog-posts.hfeed div` element.
 *
 * @param {string} url - Address of the listing page to fetch.
 * @returns {Promise<{urls: string[], authName: string}>} Resolves with the
 *   unique post links and the derived author name; rejects when the request
 *   fails or the page cannot be processed.
 */
function getAuthors(url) {
  return new Promise((resolve, reject) => {
    console.log(`Processing ${url}`);
    const segments = url.split('/');
    const lastSegment = segments[segments.length - 1];
    // A trailing slash leaves an empty last segment; fall back to the one before it.
    const authName = (lastSegment !== '') ? lastSegment : segments[segments.length - 2];
    request({
      method: 'GET',
      url: url
    }, (err, response, body) => {
      if (err) {
        console.error(err);
        // Settle the promise so callers are not left waiting forever.
        reject(err);
        return;
      }
      try {
        // Keep the cheerio handle local rather than leaking a global `$`.
        const $ = cheerio.load(body);
        // A Set drops duplicates while preserving first-seen order.
        const hrefs = new Set();
        $('#Blog1 div.blog-posts.hfeed div').each((index, block) => {
          const href = $('div div div h3 a', block).attr('href');
          if (typeof href !== 'undefined') {
            hrefs.add(href);
          }
        });
        resolve({
          urls: Array.from(hrefs),
          authName: authName,
        });
      } catch (parseErr) {
        // A throw inside this callback would otherwise bypass the promise.
        reject(parseErr);
      }
    });
  });
}
// Throttled variant of getAuthors, produced by the author throttle instance.
const throttledGetAuthors = throttleInstanceAuthor.wrap(getAuthors);
// BOOK
/**
 * Fetch a single post, extract its plain-text lines and save them to disk.
 *
 * Every `.post-body div span` whose inner HTML contains neither a tag (`<`)
 * nor a numeric character reference (`&#`) is trimmed, has line breaks
 * turned into spaces, is stripped of characters matching `punctuationRegEx`,
 * has whitespace runs collapsed and is lower-cased. Non-empty results are
 * joined with newlines and written to `<__dirname>/poets/<authName>/<name>`,
 * where `<name>` is the last non-empty URL segment up to its first dot.
 *
 * @param {string} url - Address of the post to fetch.
 * @param {string} [authName] - Sub-directory name; when falsy the file goes
 *   directly under `poets/`.
 * @returns {Promise<string>} Resolves with the target file path (nothing is
 *   written when no text was extracted); rejects when the request, the page
 *   processing or the file write fails.
 */
function getBook(url, authName) {
  return new Promise((resolve, reject) => {
    console.log(`Getting books ${authName}_${url}`);
    request({
      method: 'GET',
      url: url
    }, (err, response, body) => {
      if (err) {
        console.error(err);
        // Settle the promise so callers are not left waiting forever.
        reject(err);
        return;
      }
      try {
        // Keep the cheerio handle local rather than leaking a global `$`.
        const $ = cheerio.load(body);
        const texts = [];
        $('.post-body div span').each((index, span) => {
          const html = $(span).html();
          // Guard against a missing (null/undefined) value before calling string methods.
          if (typeof html !== 'string') return;
          // Only keep spans that hold bare text: no nested markup, no numeric entities.
          if (html.indexOf('<') !== -1 || html.indexOf('&#') !== -1) return;
          const text = html
            .trim()
            .replace(/(\r\n|\n|\r)/gm, " ")
            .replace(punctuationRegEx, '')
            .replace(/(\s){2,}/g, '$1');
          if (text !== '') texts.push(text.toLowerCase());
        });
        // Derive the output location from the URL and the author name.
        const urlParts = url.split('/');
        const lastPart = urlParts[urlParts.length - 1];
        // A trailing slash leaves an empty last segment; fall back to the one before it.
        const urlEnd = (lastPart !== '') ? lastPart : urlParts[urlParts.length - 2];
        const fileName = urlEnd.split('.')[0];
        const dirName = (authName) ? `${__dirname}/poets/${authName}` : `${__dirname}/poets/`;
        const filePath = `${dirName}/${fileName}`;
        if (texts.length === 0) {
          resolve(filePath);
          return;
        }
        // Resolve with the whole chain so a failure in either step rejects
        // this promise instead of leaving it pending.
        resolve(createDir(dirName).then(() => writeToFile(filePath, texts.join("\n"))));
      } catch (parseErr) {
        // A throw inside this callback would otherwise bypass the promise.
        reject(parseErr);
      }
    });
  });
}
// Throttled variant of getBook, produced by the books throttle instance.
const throttledGetBook = throttleInstanceBooks.wrap(getBook);
// Entry point: read the listing links from the home page, fetch every listing
// through the author throttle, then fetch and save every post it links to
// through the books throttle. A failure is logged for the item it belongs to
// and does not stop the remaining items.
getAll()
  .then((listingUrls) => Promise.all(listingUrls.map((listingUrl) =>
    // Promise.resolve() guarantees `.catch` is available whatever kind of
    // thenable the throttle wrapper returns.
    Promise.resolve(throttledGetAuthors(listingUrl))
      .then((obj) => {
        const authName = obj.authName;
        return Promise.all(obj.urls.map((postUrl) =>
          Promise.resolve(throttledGetBook(postUrl, authName))
            .catch((err) => {
              console.error(`Failed to save ${postUrl}:`, err);
            })
        ));
      })
      .catch((err) => {
        console.error(`Failed to process ${listingUrl}:`, err);
      })
  )))
  .catch((err) => {
    console.error('Crawl failed:', err);
  });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment