Created
November 26, 2019 01:51
-
-
Save ans-4175/45da1c2b1eea988394526f8c25d340c2 to your computer and use it in GitHub Desktop.
Crawlers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const request = require('request'); | |
const Promise = require('bluebird'); | |
const cheerio = require('cheerio'); | |
const fs = require("node-fs-extra"); | |
const ThrottleEngine = require("throttle-exec"); | |
// Value handed to each ThrottleEngine below; presumably the cap on jobs running
// at once per queue — confirm against the throttle-exec documentation.
const maximumProcess = 10;
// Two independent queues: one wraps getAuthors, the other wraps getBook.
const throttleInstanceAuthor = new ThrottleEngine(maximumProcess);
const throttleInstanceBooks = new ThrottleEngine(maximumProcess);
// Characters stripped from poem text before it is written to disk: every
// Unicode punctuation (\p{P}) and symbol (\p{S}) character, plus the whole
// General Punctuation (incl. its space characters), Supplemental Punctuation
// and CJK Symbols and Punctuation blocks.
// The pattern is written with escapes only on purpose: a character class that
// lists raw non-ASCII characters changes meaning (or stops compiling because a
// range ends up out of order) as soon as the file is Unicode-normalized or
// width-folded by an editor or transfer tool.
// Requires Unicode property escapes (ES2018, Node.js 10 or newer).
const punctuationRegEx = /[\p{P}\p{S}\u2000-\u206e\u2e00-\u2e7e\u3000-\u303f]/gu;
// createDir
/**
 * Make sure a directory exists, creating it with `fs.mkdir` when it is absent.
 *
 * The returned promise never rejects, so callers that only chain `.then()`
 * keep working; a failed creation is reported on stderr instead of being
 * dropped silently, and the subsequent write will surface the real problem.
 *
 * @param {string} dirName - Path of the directory to ensure.
 * @returns {Promise<string>} Resolves with `dirName` once the attempt is done.
 */
function createDir(dirName) {
  return new Promise((resolve) => {
    if (fs.existsSync(dirName)) {
      resolve(dirName);
      return;
    }
    fs.mkdir(dirName, (err) => {
      // EEXIST means another concurrent caller created the directory between
      // the existsSync check and this mkdir call; that is a success for us.
      if (err && err.code !== 'EEXIST') {
        console.error(`createDir: could not create ${dirName}:`, err);
      }
      resolve(dirName);
    });
  });
}
// write to file
/**
 * Write `content` to `filePath`, logging the destination first.
 *
 * @param {string} filePath - Destination file path.
 * @param {string} content - Data handed to `fs.writeFile`.
 * @returns {Promise<string>} Resolves with `filePath` after a successful write;
 *   rejects with the error reported by `fs.writeFile` otherwise.
 */
function writeToFile(filePath, content) {
  return new Promise((resolve, reject) => {
    console.log('writing to:', filePath);
    fs.writeFile(filePath, content, (err) => {
      if (err) {
        console.error(err);
        // Reject with the error itself: console.error() returns undefined, so
        // passing its result to reject() would lose the failure reason.
        reject(err);
        return;
      }
      resolve(filePath);
    });
  });
}
// HOME
/**
 * Download the blog front page and collect the distinct link targets found
 * in the `#Label1` list, in document order.
 *
 * @returns {Promise<string[]>} Resolves with the unique `href` values; rejects
 *   with the request error when the download fails.
 */
function getAll() {
  return new Promise((resolve, reject) => {
    request({
      method: 'GET',
      url: 'http://kepadapuisi.blogspot.co.id/'
    }, (err, response, body) => {
      if (err) {
        console.error(err);
        // Settle the promise; otherwise the caller would wait forever.
        reject(err);
        return;
      }
      // Keep the cheerio handle local instead of leaking a global `$`.
      const $ = cheerio.load(body);
      // A Set keeps insertion order and drops repeated links.
      const seen = new Set();
      $('#Label1 div ul li').each(function () {
        const href = $('a', this).attr('href');
        if (typeof href !== 'undefined') {
          seen.add(href);
        }
      });
      resolve(Array.from(seen));
    });
  });
}
// BOOKS
/**
 * Download one listing page and collect the distinct post links on it.
 *
 * The author name is the last non-empty `/`-separated segment of `url`, so a
 * trailing slash is tolerated. It is returned exactly as it appears in the URL.
 *
 * @param {string} url - Address of the listing page to download.
 * @returns {Promise<{urls: string[], authName: string}>} Resolves with the
 *   unique post links (document order) and the derived author name; rejects
 *   with the request error when the download fails.
 */
function getAuthors(url) {
  return new Promise((resolve, reject) => {
    console.log(`Processing ${url}`);
    const segments = url.split('/');
    const lastSegment = segments[segments.length - 1];
    const authName = (lastSegment !== '') ? lastSegment : segments[segments.length - 2];
    request({
      method: 'GET',
      url: url
    }, (err, response, body) => {
      if (err) {
        console.error(err);
        // Settle the promise; otherwise the caller would wait forever.
        reject(err);
        return;
      }
      // Keep the cheerio handle local instead of leaking a global `$`.
      const $ = cheerio.load(body);
      // A Set keeps insertion order and drops repeated links.
      const seen = new Set();
      $('#Blog1 div.blog-posts.hfeed div').each(function () {
        const href = $('div div div h3 a', this).attr('href');
        if (typeof href !== 'undefined') {
          seen.add(href);
        }
      });
      resolve({
        urls: Array.from(seen),
        authName: authName,
      });
    });
  });
}
// getAuthors routed through the author throttle queue.
const throttledGetAuthors = throttleInstanceAuthor.wrap(getAuthors);

// BOOK
/**
 * Download one post, extract its plain-text lines and store them on disk.
 *
 * Only `.post-body div span` elements whose inner HTML contains neither a
 * nested tag (`<`) nor a numeric character reference (`&#`) are used. Each
 * one is flattened to a single line, stripped with `punctuationRegEx`,
 * whitespace-collapsed, trimmed and lower-cased. The lines are written,
 * newline-separated, to `<__dirname>/poets/<authName>/<post name>`, where
 * the post name is the last non-empty URL segment up to its first dot.
 *
 * @param {string} url - Address of the post to download.
 * @param {string} [authName] - Sub-directory under `poets/`; when falsy the
 *   file goes directly into `poets/`.
 * @returns {Promise<string>} Resolves with the target file path (nothing is
 *   written when no usable text was found); rejects when the download or the
 *   write fails.
 */
function getBook(url, authName) {
  return new Promise((resolve, reject) => {
    console.log(`Getting books ${authName}_${url}`);
    request({
      method: 'GET',
      url: url
    }, (err, response, body) => {
      if (err) {
        console.error(err);
        // Settle the promise; otherwise the caller would wait forever.
        reject(err);
        return;
      }
      // Keep the cheerio handle local instead of leaking a global `$`.
      const $ = cheerio.load(body);
      const texts = [];
      $('.post-body div span').each(function () {
        const html = $(this).html();
        // Guard on string so a null/undefined result cannot throw below.
        if (typeof html !== 'string') return;
        // Skip spans that still contain markup or numeric entities.
        if (html.indexOf('<') !== -1 || html.indexOf('&#') !== -1) return;
        const line = html
          .replace(/(\r\n|\n|\r)/gm, " ")
          .replace(punctuationRegEx, '')
          .replace(/(\s){2,}/g, '$1')
          // Trim last: removing punctuation can expose leading/trailing
          // blanks or leave a whitespace-only line that must be dropped.
          .trim();
        if (line !== '') texts.push(line.toLowerCase());
      });

      // write to text
      const urlParts = url.split('/');
      const lastPart = urlParts[urlParts.length - 1];
      const urlEnd = (lastPart !== '') ? lastPart : urlParts[urlParts.length - 2];
      const fileName = urlEnd.split('.')[0];
      const dirName = (authName) ? `${__dirname}/poets/${authName}` : `${__dirname}/poets/`;
      const filePath = `${dirName}/${fileName}`;

      if (texts.length === 0) {
        resolve(filePath);
        return;
      }
      // Chain both steps into the outer promise so any failure reaches the caller.
      createDir(dirName)
        .then(() => writeToFile(filePath, texts.join("\n")))
        .then(resolve, reject);
    });
  });
}

// getBook routed through the book throttle queue.
const throttledGetBook = throttleInstanceBooks.wrap(getBook);
// Entry point: front page -> listing pages -> individual posts.
// Every promise is returned into the chain so no rejection is left unhandled;
// a failure of one page is reported and does not stop the remaining pages.
const reportFailure = (stage, target) => (err) => {
  console.error(`${stage} failed for ${target}:`, err);
};

getAll()
  .then((listingUrls) => Promise.all(listingUrls.map((listingUrl) =>
    // Promise.resolve() adopts whatever thenable the throttle wrapper returns.
    Promise.resolve(throttledGetAuthors(listingUrl))
      .then((listing) => Promise.all(listing.urls.map((postUrl) =>
        Promise.resolve(throttledGetBook(postUrl, listing.authName))
          .catch(reportFailure('getBook', postUrl))
      )))
      .catch(reportFailure('getAuthors', listingUrl))
  )))
  .catch((err) => {
    console.error('Crawl aborted:', err);
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment