Skip to content

Instantly share code, notes, and snippets.

@yangfch3
Created September 13, 2018 01:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yangfch3/db008c851e548e2ffb59d1b1f4064218 to your computer and use it in GitHub Desktop.
Save yangfch3/db008c851e548e2ffb59d1b1f4064218 to your computer and use it in GitHub Desktop.
[Chromeless 豆瓣数据抓取脚本] 以抓取一本书的猜你喜欢书籍列表信息为例 #crawl #douban
const {
Chromeless
} = require('chromeless')
const fs = require('fs')
const myConsole = console
const bookName = process.argv[2]
const bookInfoList = []
if (!bookName) {
myConsole.log('未输入正确的书籍名称')
return
}
const chromeless = new Chromeless({
waitTimeout: 30000
})
/**
 * Searches Douban Books for `bookName`, opens the first search hit, and
 * collects the links found in that page's `#db-rec-section` block.
 *
 * @param {string} bookName - Title to search for; it is URL-encoded here.
 * @returns {Promise<string[]>} The `href` of every matched recommendation link
 *   (empty when the section has no matching links).
 * @throws {Error} When the search page yields no usable first result.
 */
async function crawlRelatedBookUrls(bookName) {
  const searchUrl = `https://book.douban.com/subject_search?search_text=${encodeURIComponent(bookName)}&cat=1001`

  // The callback passed to evaluate() works against the page's `document`;
  // it returns null instead of throwing when the result list is empty.
  const bookUrl = await chromeless.goto(searchUrl).wait(1000).evaluate(() => {
    const cover = document.querySelector('#root div div div .item-root img')
    const link = cover ? cover.parentElement : null
    return link ? link.getAttribute('href') : null
  })

  // Fail with a readable message rather than navigating to an empty URL.
  if (!bookUrl) {
    throw new Error(`No search result found for "${bookName}"`)
  }

  // The longer wait is kept from the original script before the
  // recommendation section is read.
  const relatedBookUrls = await chromeless.goto(bookUrl).wait(5000).evaluate(() => {
    // Declared locally: the original assigned to an undeclared name here,
    // which leaked a global into the page.
    const links = document.querySelectorAll('#db-rec-section .content dl dt a')
    return Array.prototype.map.call(links, (item) => item.getAttribute('href'))
  })

  return relatedBookUrls
}
/**
 * Opens one Douban book page, scrapes its headline fields, and appends the
 * resulting record to the shared `bookInfoList` array.
 *
 * Every field falls back to an empty string when its element (or, for
 * `pubTime`, the publication pattern) is absent from the page.
 *
 * @param {string} url - Address of the book page to visit.
 * @returns {Promise<void>} Resolves once the record has been pushed.
 */
async function crawlBookInfo(url) {
  const bookInfo = await chromeless.goto(url).wait(1000).evaluate(() => {
    // The helper lives inside the callback because the callback works against
    // the page's `document`, not this script's scope.
    const textOf = (selector) => {
      const node = document.querySelector(selector)
      return node ? node.innerText : ''
    }

    // Property order is kept from the original so output.json is unchanged.
    const info = {}
    info.name = textOf('#wrapper > h1 > span')

    // Drops the first run of whitespace, as the original did. The original
    // character class also listed a literal backslash and the letter "n",
    // which deleted characters from Latin names (e.g. "Anna" -> "Aa").
    info.author = textOf('#info > a:nth-child(2)').replace(/\s+/, '')

    info.scoreNum = textOf('#interest_sectl > div > div.rating_self.clearfix > div > div.rating_sum > span > a > span')
    info.score = textOf('#interest_sectl > div > div.rating_self.clearfix > strong')

    // Each tag is prefixed with "|" (so the string starts with "|"); that
    // output format is preserved for existing consumers of the data.
    const tags = document.querySelectorAll('#db-tags-section > div > span > a')
    info.category = Array.from(tags).reduce((joined, tag) => joined + '|' + tag.innerText, '')

    // First number after "出版", with an optional "-digits" part captured in
    // full. The original capture stopped after one digit, turning "2018-10"
    // into "2018-1", and threw when `#info` or the pattern was missing.
    const pubMatch = /出版[\w\W]+?(\d+(?:-\d+)?)/.exec(textOf('#info'))
    info.pubTime = pubMatch ? pubMatch[1] : ''

    return info
  })

  bookInfoList.push(bookInfo)
}
/**
 * Runs the whole crawl: resolves the related-book links for the requested
 * title, then visits them one after another.
 *
 * Pages are awaited sequentially because every navigation goes through the
 * single shared `chromeless` instance.
 *
 * @returns {Promise<Array>} The shared `bookInfoList`, filled in visit order.
 */
async function exec() {
  const urls = await crawlRelatedBookUrls(bookName)
  for (const pageUrl of urls) {
    await crawlBookInfo(pageUrl)
  }
  return bookInfoList
}
/**
 * Script entry point: runs the crawl, prints the result, and writes it to
 * output.json in the current working directory.
 *
 * The browser session is closed on both success and failure, and that
 * shutdown is awaited before the process exits. A failure sets a non-zero
 * exit status so callers can detect it.
 */
async function main() {
  try {
    const res = await exec()
    myConsole.log(res)
    fs.writeFileSync('output.json', JSON.stringify(res, null, 4))
  } catch (e) {
    myConsole.log(e)
    process.exitCode = 1
  } finally {
    // A failing shutdown is logged but must not mask the crawl outcome.
    try {
      await chromeless.end()
    } catch (e) {
      myConsole.log(e)
    }
  }
  // Called without a code so the process.exitCode set above is honoured.
  process.exit()
}

main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment