Skip to content

Instantly share code, notes, and snippets.

@leejh3224
Created May 9, 2018 07:38
Show Gist options
  • Save leejh3224/791ae81b1d406311d412d65e1b763ffd to your computer and use it in GitHub Desktop.
Save leejh3224/791ae81b1d406311d412d65e1b763ffd to your computer and use it in GitHub Desktop.
const axios = require('axios')
const cheerio = require('cheerio')
const {
URL
} = require('url')
const fs = require('fs')
const getSelectors = require('utils/getSelectors')
const sanitizeHtml = require('utils/sanitizeHtml')
const fixImageSource = require('utils/fixImageSource')
/**
 * Fetches a web page and extracts either the list of post links on a board
 * page (`parsePage`) or the content of a single post (`parsePost`).
 *
 * The CSS selectors used for extraction come from `getSelectors(url)`; this
 * class reads its `link`, `body`, `author` and `title` entries.
 */
class Parser {
  /**
   * @param {Object} params
   * @param {string} params.url - Address of the page to fetch and parse.
   * @param {Object} [params.cheerioOptions] - Options forwarded as the second
   *   argument of `cheerio.load`.
   */
  constructor({
    url,
    cheerioOptions
  }) {
    this.url = url
    this.options = cheerioOptions
    this.selectors = getSelectors(url)
  }

  /**
   * Downloads `this.url` and loads the response body into cheerio.
   *
   * @returns {Promise<Function|null>} The value returned by `cheerio.load`,
   *   or `null` when the request or the load step throws.
   */
  async loadCheerio() {
    try {
      const {
        data
      } = await axios.get(this.url)
      // The constructor stores the options under `this.options`; reading
      // `this.cheerioOptions` here would always pass `undefined`.
      return cheerio.load(data, this.options)
    } catch (error) {
      // Deliberately best-effort: callers treat `null` as "page unavailable".
      return null
    }
  }

  /**
   * Parses a board page and collects the links it contains
   * (`parsePost` is the counterpart that parses an individual post).
   *
   * @returns {Promise<{links: string[]}|null>} Absolute URLs of the matched
   *   links, or `null` when the page could not be loaded.
   */
  async parsePage() {
    const $ = await this.loadCheerio()
    if (!$) {
      return null
    }

    // Loop-invariant, so compute it once instead of once per link.
    const {
      origin
    } = new URL(this.url)
    const links = []

    // On Instiz, titles are laid out as "TEXTHEAD) title", so the links that
    // belong to the TEXTHEAD part are excluded. `.not()` is used because
    // `.remove(selector)` only detaches nodes from the document and leaves
    // the collection itself unfiltered.
    $(this.selectors.link)
      .not('.texthead')
      .each((i, el) => {
        const href = $(el).attr('href')
        if (!href) {
          // An anchor without href would otherwise become "<origin>/undefined".
          return
        }
        try {
          // Instiz uses relative hrefs such as ../fan=12323;
          // new URL(href, base) turns them into absolute URLs.
          links.push(new URL(href, origin).href)
        } catch (error) {
          // A single malformed href should not discard the whole page.
        }
      })

    return {
      links,
    }
  }

  /**
   * Parses a single post page.
   *
   * Posts whose body holds more `<img>` tags than the numeric value of the
   * `MAX_IMAGES_PER_POST` environment variable are rejected. When that
   * variable is unset the comparison is against `NaN`, which is never true,
   * so no limit applies.
   *
   * @returns {Promise<{author: string, body: *, title: string, images: *}|null>}
   *   The extracted fields (`body` is whatever `sanitizeHtml` returns and
   *   `images` whatever `fixImageSource` returns), or `null` when the page
   *   could not be loaded or the post exceeds the image limit.
   */
  async parsePost() {
    const $ = await this.loadCheerio()
    if (!$) {
      return null
    }

    const imageTags = $(`${this.selectors.body} img`)
    const maxImages = Number(process.env.MAX_IMAGES_PER_POST)
    if (imageTags.length > maxImages) {
      return null
    }

    const images = fixImageSource({
      url: this.url,
      // Wrap each raw element with cheerio before handing it over.
      images: imageTags.toArray().map((el) => $(el)),
    })
    const author = $(this.selectors.author).text()
    const body = sanitizeHtml($(this.selectors.body).html())
    const title = $(this.selectors.title).text()

    return {
      author,
      body,
      title,
      images,
    }
  }
}
// The Parser class is this module's sole export.
module.exports = Parser
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment