Created
May 9, 2018 07:38
-
-
Save leejh3224/791ae81b1d406311d412d65e1b763ffd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const axios = require('axios')
const cheerio = require('cheerio')
const { URL } = require('url')
const fs = require('fs')
const getSelectors = require('utils/getSelectors')
const sanitizeHtml = require('utils/sanitizeHtml')
const fixImageSource = require('utils/fixImageSource')
/**
 * Downloads a web page and extracts data from it with cheerio, using the
 * selector set that `getSelectors` returns for the page's URL.
 *
 * - `parsePage` parses the post links found on a board (listing) page.
 * - `parsePost` parses a single post.
 */
class Parser {
  /**
   * @param {Object} params
   * @param {string} params.url - Absolute URL of the page to download; also
   *   passed to `getSelectors` to pick the selector set.
   * @param {Object} [params.cheerioOptions] - Options forwarded to `cheerio.load`.
   */
  constructor({
    url,
    cheerioOptions
  }) {
    this.url = url
    // loadCheerio reads `this.cheerioOptions`; the options used to be stored
    // only under `this.options`, so they never reached cheerio.load.
    this.cheerioOptions = cheerioOptions
    // Kept as an alias for any external code that reads `parser.options`.
    this.options = cheerioOptions
    // The methods below read `.link`, `.body`, `.author` and `.title` from this object.
    this.selectors = getSelectors(url)
  }

  /**
   * Downloads `this.url` and loads the response body into cheerio.
   *
   * @returns {Promise<Function|null>} The cheerio root function, or `null`
   *   when the download or the load throws. Failures are deliberately mapped
   *   to `null`; both callers in this class treat that as "nothing parsed".
   */
  async loadCheerio() {
    try {
      const {
        data
      } = await axios.get(this.url)
      return cheerio.load(data, this.cheerioOptions)
    } catch (error) {
      return null
    }
  }

  /**
   * Parses a board page and collects the links it contains as absolute URLs.
   *
   * @returns {Promise<{links: string[]}|null>} `null` when the page could not
   *   be loaded.
   */
  async parsePage() {
    const $ = await this.loadCheerio()
    if (!$) {
      return null
    }

    // On Instiz a title looks like "TEXTHEAD) title", so links that belong to
    // the TEXTHEAD part must be left out. `.remove(selector)` only detaches
    // nodes from the DOM and returns the unchanged selection, so the
    // selection itself is filtered with `.not()` instead.
    const linksInBody = $(this.selectors.link).not('.texthead')

    // Loop-invariant: the origin of the page being parsed.
    const {
      origin
    } = new URL(this.url)

    const links = []
    linksInBody.each((i, el) => {
      const href = $(el).attr('href')
      // An anchor with a missing or empty href has no target; without this
      // guard `new URL(undefined, origin)` would yield "<origin>/undefined".
      if (!href) {
        return
      }
      // Instiz uses relative hrefs such as "../fan=12323";
      // new URL(href, base) converts them to absolute URLs.
      links.push(new URL(href, origin).href)
    })

    return {
      links,
    }
  }

  /**
   * Parses a single post page.
   *
   * @returns {Promise<{author: string, body: *, title: string, images: *}|null>}
   *   `null` when the page could not be loaded or when the post body holds
   *   more `<img>` tags than `process.env.MAX_IMAGES_PER_POST`. When that
   *   variable is unset the comparison is against NaN and never rejects.
   */
  async parsePost() {
    const $ = await this.loadCheerio()
    if (!$) {
      return null
    }

    const imageTags = $(`${this.selectors.body} img`)
    if (imageTags.length > Number(process.env.MAX_IMAGES_PER_POST)) {
      return null
    }

    const images = fixImageSource({
      url: this.url,
      // Wrap each raw element with cheerio before handing it over.
      images: imageTags.toArray().map(el => $(el)),
    })
    const author = $(this.selectors.author).text()
    const body = sanitizeHtml($(this.selectors.body).html())
    const title = $(this.selectors.title).text()

    return {
      author,
      body,
      title,
      images,
    }
  }
}
// The Parser class is this module's export.
module.exports = Parser
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment