Skip to content

Instantly share code, notes, and snippets.

@Astro36
Last active August 15, 2018 12:52
Show Gist options
  • Save Astro36/aac89d7ef1be5cab53cff9d88c6e0a23 to your computer and use it in GitHub Desktop.
Save Astro36/aac89d7ef1be5cab53cff9d88c6e0a23 to your computer and use it in GitHub Desktop.
Naver News Crawler for Node.js
const fs = require('fs');
const jsdom = require('jsdom');
const parallel = require('parallel-tasks');
const path = require('path');
const request = require('request');
const { JSDOM } = jsdom;
/**
 * Renders a date as the year followed by a two-digit month and a two-digit
 * day (e.g. `20180815`), using the local-time getters of `date`.
 *
 * @param {Date} date - The date to format.
 * @returns {string} The concatenated year, month and day.
 */
const formatDate = (date) => {
  const year = date.getFullYear();
  // getMonth() is zero-based, so shift it to the calendar month first.
  const month = String(date.getMonth() + 1).padStart(2, '0');
  const day = String(date.getDate()).padStart(2, '0');
  return `${year}${month}${day}`;
};
/**
 * Lookup table from a news outlet name (Korean) to the numeric id that is
 * zero-padded and sent as the `oid` query parameter of Naver News URLs.
 *
 * Frozen so the shared table cannot be modified by accident at runtime.
 */
const NewsType = Object.freeze({
  경향신문: 32,
  국민일보: 5,
  동아일보: 20,
  문화일보: 21,
  서울신문: 81,
  세계일보: 22,
  조선일보: 23,
  중앙일보: 25,
  한겨레: 28,
  한국일보: 469,
});
/**
 * Converts a key of `NewsType` into the zero-padded, three-character id used
 * as the `oid` query parameter.
 *
 * @param {string} type - A key of `NewsType`.
 * @returns {string} The id padded with leading zeros to at least three characters.
 * @throws {TypeError} If `type` is not an own key of `NewsType`.
 */
function resolveNewsTypeId(type) {
  // Own-key check so inherited names such as "toString" are not accepted.
  if (!Object.prototype.hasOwnProperty.call(NewsType, type)) {
    throw new TypeError(`Unknown news type: ${type}`);
  }
  return NewsType[type].toString().padStart(3, '0');
}

/**
 * Reduces the markup of an article body to plain text.
 *
 * @param {string} html - The `innerHTML` of the article body element.
 * @returns {string} The trimmed text with tags and comments removed.
 */
function extractArticleText(html) {
  return html
    // Non-greedy, so text sitting between two comments is preserved.
    .replace(/<!--[\s\S]*?-->/g, '')
    // Drops the first `//` line comment left over from the inline script.
    .replace(/\/\/[^\n]*\n/, '')
    .replace(/(?:<br>)+/g, '\n')
    .replace(/<(?:.|\n)*?>/gm, '')
    .replace('function _flash_removeCallback() {}', '')
    .trim();
}

/**
 * Downloads Naver News articles of one outlet and stores them as JSON files.
 */
class NewsCrawler {
  /**
   * Requests today's title list of the given outlet and reads the `aid`
   * value from the first article link found on that page.
   *
   * @param {string} type - A key of `NewsType`.
   * @returns {Promise<number>} The numeric article id.
   *   Rejects when `type` is unknown, the request fails, no article link is
   *   present, or the link carries no numeric `aid`.
   */
  static getLastestArticleId(type) {
    return new Promise((resolve, reject) => {
      // A throw here happens inside the executor and therefore rejects.
      const typeId = resolveNewsTypeId(type);
      const listUrl = `http://news.naver.com/main/list.nhn?mode=LPOD&mid=sec&oid=${typeId}&listType=title&date=${formatDate(new Date())}`;
      request.get(listUrl, (err, httpResponse, body) => {
        if (err) {
          reject(err);
          return;
        }
        // Anything thrown in this callback would otherwise escape the
        // promise entirely, so convert it into a rejection.
        try {
          const { document } = new JSDOM(body).window;
          const link = document.querySelector('.list_body > ul.type02 > li > a');
          if (!link) {
            throw new Error(`No article link found for news type: ${type}`);
          }
          // parseInt stops at the first non-digit, so parameters following
          // `aid` in the query string do not turn the result into NaN.
          const articleId = Number.parseInt(link.href.split('aid=')[1], 10);
          if (Number.isNaN(articleId)) {
            throw new Error(`No article id found in link: ${link.href}`);
          }
          resolve(articleId);
        } catch (parseError) {
          reject(parseError);
        }
      });
    });
  }

  /**
   * Correctly spelled alias of {@link NewsCrawler.getLastestArticleId}.
   *
   * @param {string} type - A key of `NewsType`.
   * @returns {Promise<number>} The numeric article id.
   */
  static getLatestArticleId(type) {
    return NewsCrawler.getLastestArticleId(type);
  }

  /**
   * Fetches `articleAmount` articles with ids `articleId`, `articleId - 1`, ...
   * and writes each one that has both a title and a body to
   * `<__dirname>/<type>/news<index>.json` as `{ title, content }`.
   *
   * Individual articles are best-effort: a failed request or a page without
   * the expected elements is skipped and does not fail the whole run.
   *
   * @param {string} type - A key of `NewsType`; also used as the output directory name.
   * @param {number} articleId - The highest article id to fetch.
   * @param {number} [articleAmount=100] - How many consecutive ids to fetch, counting down.
   * @returns {Promise<boolean>} Resolves to `true` once every task has finished.
   *   Rejects when `type` is unknown or the output directory cannot be created.
   */
  static async run(type, articleId, articleAmount = 100) {
    const typeId = resolveNewsTypeId(type);
    const newsDir = path.join(__dirname, type);
    if (!fs.existsSync(newsDir)) {
      fs.mkdirSync(newsDir);
    }
    const urls = Array.from(
      { length: articleAmount },
      (_, index) => `http://news.naver.com/main/read.nhn?mode=LPOD&mid=sec&oid=${typeId}&aid=${(articleId - index).toString().padStart(10, '0')}`,
    );
    const createTask = (url, index) => () => new Promise((done) => {
      console.log(url);
      // encoding: null hands the raw bytes to JSDOM instead of a decoded string.
      request.get({ url, encoding: null }, (err, httpResponse, body) => {
        try {
          if (err) {
            console.error(`Request failed for ${url}:`, err);
            return;
          }
          const { document } = new JSDOM(body).window;
          const titleElement = document.querySelector('.article_header > .article_info > #articleTitle');
          const contentElement = document.querySelector('#articleBody > #articleBodyContents');
          if (titleElement && contentElement) {
            const title = titleElement.innerHTML.trim();
            const content = extractArticleText(contentElement.innerHTML);
            fs.writeFileSync(path.join(newsDir, `news${index}.json`), JSON.stringify({ title, content }));
          }
        } catch (taskError) {
          // Keep the crawl going; one bad article must not abort the rest.
          console.error(`Could not store ${url}:`, taskError);
        } finally {
          // Always settle, otherwise the task runner would wait forever.
          done();
        }
      });
    });
    await parallel.run(urls.map(createTask));
    return true;
  }
}
// Entry point: crawls 1000 articles counting down from a fixed article id.
// NOTE(review): this starts whenever the file is loaded, including through
// require() from another module — confirm that is intended.
(async () => {
  // To start from the id returned by the list page instead of a fixed one:
  // const articleId = await NewsCrawler.getLastestArticleId('경향신문');
  // console.log(articleId);
  await NewsCrawler.run('경향신문', 2854930, 1000);
})().catch((err) => {
  // Without a handler a failure would surface only as an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

module.exports = NewsCrawler;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment